diff --git a/.gitignore b/.gitignore index ac56a3320ec85769d2c87c072512f5217eca0c24..fe0d13f4d9eab2c2a8e7001c9ecb69cce1333af1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +paddle/operators/check_t.save +paddle/operators/check_tensor.ls +paddle/operators/tensor.save +python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/ +python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/ +python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/ *.DS_Store build/ build_doc/ @@ -27,5 +33,5 @@ CMakeFiles cmake_install.cmake paddle/.timestamp python/paddlepaddle.egg-info/ -paddle/pybind/pybind.h +paddle/fluid/pybind/pybind.h python/paddle/version.py diff --git a/CMakeLists.txt b/CMakeLists.txt index e8ea828dd2a25f5f47b03e92ae86e083d4425dc9..3a21574b855bc6bc37fefe61de98d657e712cde7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc -include(external/boost) # download, build, install boost +include(external/boost) # download boost include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 @@ -156,6 +156,7 @@ include(rdma) # set rdma libraries include(flags) # set paddle compile flags include(version) # set PADDLE_VERSION include(coveralls) # set code coverage +include(inference_lib) # add paddle fluid inference libraries include_directories("${PADDLE_SOURCE_DIR}") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a60453ff4e3bba6e6cb3b3de915dd69afd3a1ec3..3c36cffcb4eeaaf7f8cff5167777628dd2697e7d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,8 @@ # Contribute Code +You are welcome to contribute to project PaddlePaddle. 
To contribute to PaddlePaddle, you have to agree with the +[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329). + We sincerely appreciate your contribution. This document explains our workflow and work style. ## Workflow diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 6bea7cf3022242ce48cc882915f7e71810937283..de94bd5008effef1bf0fd3a125d4aed56e1b7f81 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) + # nvcc 9 does not support -Os. Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index c70d83b3f4bb24740ed67b4e2f98a3ced26d1648..dbc676bdac30e0d730206c17a1912d49d4f896eb 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." 
FORCE) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) include_directories(${BOOST_INCLUDE_DIR}) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index d49c8d601102cf865287c33349bff5eee6a90f6d..6a701e076c95372f903a09d35d4208ee73bd584c 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -28,9 +28,3 @@ endif() add_dependencies(eigen3 extern_eigen3) LIST(APPEND external_project_dependencies eigen3) - -IF(NOT WITH_C_API AND WITH_FLUID) - INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen) - INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen) - INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported) -ENDIF() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 60946304541a20809276c3e665d8524baf209006..d4f252bb9f64c8db82b841fedf0817f5d8596501 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags) LIST(APPEND external_project_dependencies gflags) -IF(WITH_C_API OR WITH_FLUID) +IF(WITH_C_API) INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) IF(ANDROID) INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI}) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 382fbda3b5cfeba893f03871cf65498d20804f36..0c6b3aafcb4e990b9d4549820137474e5968a7aa 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) -IF(WITH_C_API OR WITH_FLUID) +IF(WITH_C_API) INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog) IF(ANDROID) INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 
365a370a9cfb708379bcff18ae6aa0725d420ae1..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND) SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) - IF(WITH_C_API OR WITH_FLUID) + IF(WITH_C_API) INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) IF(ANDROID) INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 7cb4efa7bff7164464f1210a2b2188226c219ef6..5fa60df7b3f6698ceeee1e4f6d868a3d4bfc7a41 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( -DWITH_TORCH=OFF -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 33ef6860e1d38f4e87c4431addf43f9f8a655fc2..1cb54ba2164fafbfce9f28a3e894ae5e78a9cd68 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -179,20 +179,24 @@ function(cc_library TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (cc_library_SRCS) - if (cc_library_SHARED OR cc_library_shared) # build *.so + if(cc_library_SRCS) + if(cc_library_SHARED OR cc_library_shared) # build *.so add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) endif() - if (cc_library_DEPS) + if(cc_library_DEPS) # Don't need link libwarpctc.so - if ("${cc_library_DEPS};" MATCHES "warpctc;") + if("${cc_library_DEPS};" MATCHES "warpctc;") list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() + # Support linking flags: 
--whole-archive (Linux) / -force_load (MacOS) + target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) + if("${cc_library_DEPS}" MATCHES "ARCHIVE_START") + list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END) + endif() add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() # cpplint code style diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7d53554358497762b1cd91c39bdd23c5807af2bc --- /dev/null +++ b/cmake/inference_lib.cmake @@ -0,0 +1,90 @@ +# make package for paddle fluid shared and static library +function(copy TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DSTS DEPS) + cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) + list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) + if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) + message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers") + endif() + math(EXPR len "${copy_lib_SRCS_len} - 1") + + add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS}) + foreach(index RANGE ${len}) + list(GET copy_lib_SRCS ${index} src) + list(GET copy_lib_DSTS ${index} dst) + add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}") + if(IS_DIRECTORY ${src}) + add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}") + else() + add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}") + endif() + endforeach() +endfunction() + +# third party +set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3") +copy(eigen3_lib + SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen + DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported +) + +set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags") +copy(gflags_lib + SRCS 
${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib +) + +set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog") +copy(glog_lib + SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib +) + +IF(NOT PROTOBUF_FOUND) + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf") + copy(protobuf_lib + SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY} + DSTS ${dst_dir} ${dst_dir}/lib + ) +ENDIF(NOT PROTOBUF_FOUND) + +# paddle fluid module +set(src_dir "${PADDLE_SOURCE_DIR}/paddle") +set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle") +set(module "framework") +copy(framework_lib DEPS framework_py_proto + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} +) + +set(module "memory") +copy(memory_lib + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail +) + +set(module "inference") +copy(inference_lib DEPENDS paddle_fluid_shared + SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so + DSTS ${dst_dir}/${module} ${dst_dir}/${module} +) + +set(module "platform") +copy(platform_lib + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details +) + +set(module "string") +copy(string_lib + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat +) + +add_custom_target(inference_lib_dist DEPENDS + inference_lib framework_lib memory_lib platform_lib string_lib + gflags_lib glog_lib protobuf_lib eigen3_lib) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 94dd3457fb5b513441c4c8e339e1862de9092517..58ce5d61c950d12630cfe1de354ffc2a2ba1fd59 100644 --- a/doc/CMakeLists.txt +++ 
b/doc/CMakeLists.txt @@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn ${SPHINX_CACHE_DIR_CN} ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) + +add_subdirectory(api) diff --git a/doc/api/CMakeLists.txt b/doc/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e0bc1d5b8e799ef86cb92a0dda348b0be4e299a --- /dev/null +++ b/doc/api/CMakeLists.txt @@ -0,0 +1,20 @@ +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") + +# HTML output director +set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" + "${BINARY_BUILD_DIR_EN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_api_docs + html + ${BINARY_BUILD_DIR_EN} + ${SPHINX_CACHE_DIR_EN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_EN}) diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index e24613b94b422b7cdf9c6383c359fa92a4faf6ff..58c493fd7412cf9dbe507c9622d67dae33a5fb25 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -323,6 +323,12 @@ batch_norm .. autofunction:: paddle.v2.fluid.layers.batch_norm :noindex: +layer_norm +---------- + +.. 
autofunction:: paddle.v2.fluid.layers.layer_norm + :noindex: + beam_search_decode ------------------ diff --git a/doc/howto/dev/build_cn.md b/doc/build_and_install/build_cn.md similarity index 100% rename from doc/howto/dev/build_cn.md rename to doc/build_and_install/build_cn.md diff --git a/doc/howto/dev/build_en.md b/doc/build_and_install/build_en.md similarity index 100% rename from doc/howto/dev/build_en.md rename to doc/build_and_install/build_en.md diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst similarity index 100% rename from doc/getstarted/build_and_install/build_from_source_cn.rst rename to doc/build_and_install/build_from_source_cn.rst diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst similarity index 100% rename from doc/getstarted/build_and_install/build_from_source_en.rst rename to doc/build_and_install/build_from_source_en.rst diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/build_and_install/docker_install_cn.rst similarity index 100% rename from doc/getstarted/build_and_install/docker_install_cn.rst rename to doc/build_and_install/docker_install_cn.rst diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/build_and_install/docker_install_en.rst similarity index 100% rename from doc/getstarted/build_and_install/docker_install_en.rst rename to doc/build_and_install/docker_install_en.rst diff --git a/doc/build_and_install/index_cn.rst b/doc/build_and_install/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..4220ff2279333f25eb644227100308428bf72362 --- /dev/null +++ b/doc/build_and_install/index_cn.rst @@ -0,0 +1,33 @@ +安装与编译 +========== + +.. _install_steps: + +安装流程 +++++++++ + +PaddlePaddle提供pip和Docker的安装方式: + +.. toctree:: + :maxdepth: 1 + + pip_install_cn.rst + docker_install_cn.rst + build_cn.md + +编译流程 +++++++++ + +.. 
warning:: + + 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。 + +.. toctree:: + :maxdepth: 1 + + build_from_source_cn.rst + +常见问题解答 +++++++++++ + +`常见问题解答 `_ diff --git a/doc/build_and_install/index_en.rst b/doc/build_and_install/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..db6b5be742be1619c52f5f7000bec013e818693d --- /dev/null +++ b/doc/build_and_install/index_en.rst @@ -0,0 +1,34 @@ +Install and Build +================= + +.. _install_steps: + +Install Steps ++++++++++++++ + +You can choose either pip or Docker to complete your install: + +.. toctree:: + :maxdepth: 1 + + pip_install_en.rst + docker_install_en.rst + build_en.md + + +Build from Source +----------------- + +.. warning:: + + We recommend installing directly via the installation steps above; you only need to build PaddlePaddle from source when you need a modified binary. + +.. toctree:: + :maxdepth: 1 + + build_from_source_en.rst + +FAQ +++++++++++ + +`FAQ `_ diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/build_and_install/paddleci.png similarity index 100% rename from doc/getstarted/build_and_install/paddleci.png rename to doc/build_and_install/paddleci.png diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/build_and_install/pip_install_cn.rst similarity index 100% rename from doc/getstarted/build_and_install/pip_install_cn.rst rename to doc/build_and_install/pip_install_cn.rst diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/build_and_install/pip_install_en.rst similarity index 100% rename from doc/getstarted/build_and_install/pip_install_en.rst rename to doc/build_and_install/pip_install_en.rst diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md index f9991541bc51c6e13ffce4e9cec60f73dc800121..773b7b6a767541f28c27f247c1ad8c9a8a2d0ccf 100644 --- a/doc/design/auto_gradient_check.md +++ b/doc/design/auto_gradient_check.md @@ -1,23 +1,23 @@ -## Auto Gradient Checker Design +## Auto Gradient 
Check Design -## Backgraound: -- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right: - 1. you should get the right backpropagation formula according to the forward computation. - 2. you should implement it right in CPP. - 3. it's difficult to prepare test data. +## Background: +- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges: + 1. The backpropagation formula should be correct according to the forward computation. + 2. The implementation of the above should be correct in CPP. + 3. It is difficult to prepare unbiased test data. -- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages: - 1. numerical gradient checker only need forward operator. - 2. user only need to prepare the input data for forward Operator. +- Auto gradient checking gets a numerical gradient using the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages: + 1. Numerical gradient checker only needs the forward operator. + 2. The user only needs to prepare the input data for the forward Operator and need not worry about the backward Operator. ## Mathematical Theory -The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful. +The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful. 
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) -## Numeric Gradient Implementation +## Numerical Gradient Implementation ### Python Interface ```python def get_numerical_gradient(op, @@ -27,73 +27,76 @@ def get_numerical_gradient(op, delta=0.005, local_scope=None): """ - Get Numeric Gradient for an operator's input. + Get Numerical Gradient for the input of an operator. - :param op: C++ operator instance, could be an network + :param op: C++ operator instance, could be an network. :param input_values: The input variables. Should be an dictionary, whose key is - variable name, and value is numpy array. + variable name, and value is a numpy array. :param output_name: The final output variable name. - :param input_to_check: The input variable with respect to which to compute the gradient. - :param delta: The perturbation value for numeric gradient method. The - smaller delta is, the more accurate result will get. But if that delta is - too small, it will suffer from numerical stability problem. + :param input_to_check: The input variable with respect to which the gradient has to be computed. + :param delta: The perturbation value for numerical gradient method. The + smaller the delta, the more accurate the result. But if the delta is too + small, it will suffer from the numerical stability problem. :param local_scope: The local scope used for get_numeric_gradient. :return: The gradient array in numpy format. """ ``` -### Explaination: +### Explanation: -- Why need `output_name` - - An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable. 
+- Why do we need an `output_name` + - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable. -- Why need `input_to_check` - - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times. +- Why do we need `input_to_check` + - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input. ### Core Algorithm Implementation ```python - # we only compute gradient of one element a time. + # we only compute the gradient of one element a time. # we use a for loop to compute the gradient of each element. for i in xrange(tensor_size): - # get one input element by its index i. - origin = tensor_to_check.get_float_element(i) + # get one input element using the index i. + original = tensor_to_check.get_float_element(i) - # add delta to it, run op and then get the new value of the result tensor. - x_pos = origin + delta + # add delta to it, run the forward op and then + # get the new value of the result tensor. + x_pos = original + delta tensor_to_check.set_float_element(i, x_pos) y_pos = get_output() - # plus delta to this element, run op and get the new value of the result tensor. - x_neg = origin - delta + # Subtract delta from this element, run the op again + # and get the new value of the result tensor. 
+ x_neg = original - delta tensor_to_check.set_float_element(i, x_neg) y_neg = get_output() # restore old value - tensor_to_check.set_float_element(i, origin) + tensor_to_check.set_float_element(i, original) - # compute the gradient of this element and store it into a numpy array. + # compute the gradient of this element and store + # it into a numpy array. gradient_flat[i] = (y_pos - y_neg) / delta / 2 # reshape the gradient result to the shape of the source tensor. return gradient_flat.reshape(tensor_to_check.get_dims()) ``` -## Auto Graident Checker Framework +## Auto Gradient Check Framework Each Operator Kernel has three kinds of Gradient: 1. Numerical gradient 2. CPU kernel gradient -3. GPU kernel gradient (if supported) +3. GPU kernel gradient (if supported by the device) -The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps: +The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps: -1. calculate the numerical gradient -2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient -3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported) +1. Calculate the numerical gradient +2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient. +3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported) #### Python Interface @@ -109,26 +112,27 @@ The numerical gradient only relies on forward Operator. So we use the numerical """ :param forward_op: used to create backward_op :param input_vars: numpy value of input variable. The following - computation will use these variables. 
- :param inputs_to_check: the input variable with respect to which to compute the gradient. + computation will use these variables. + :param inputs_to_check: the input variable with respect to which the + gradient will be computed. :param output_name: The final output variable name. :param max_relative_error: The relative tolerance parameter. - :param no_grad_set: used when create backward ops + :param no_grad_set: used to create backward ops :param only_cpu: only compute and check gradient on cpu kernel. :return: """ ``` -### How to check if two numpy array is close enough? -if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad +### How to check if two numpy arrays are close enough? +if `abs_numerical_grad` is nearly zero, then use absolute error for numerical_grad. ```python numerical_grad = ... operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) abs_numerical_grad = numpy.abs(numerical_grad) -# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative -# error. +# if abs_numerical_grad is nearly zero, then use abs error for +# numeric_grad, instead of relative error. abs_numerical_grad[abs_numerical_grad < 1e-3] = 1 diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad @@ -137,10 +141,10 @@ max_diff = numpy.max(diff_mat) #### Notes: -The Input data for auto gradient checker should be reasonable to avoid numerical stability problem. +The Input data for auto gradient checker should be reasonable to avoid numerical stability problem. 
-#### Refs: +#### References: - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) diff --git a/doc/design/cpp_data_feeding.md b/doc/design/cpp_data_feeding.md new file mode 100644 index 0000000000000000000000000000000000000000..40205350f99722f0b71bfa6f390fe9d01d831966 --- /dev/null +++ b/doc/design/cpp_data_feeding.md @@ -0,0 +1,79 @@ +# C++ Data Feeding + +In training with Paddle V2 API, data feeding wholly dependents on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. + +In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching. + +## Reader + +A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be hold by our `Variable` and they are used to read or process file data. + + +### `ReaderBase` + +`ReaderBase` is the abstract base class of all readers. It defines the all readers' interfaces. + +```cpp +class ReaderBase { + public: + explicit ReaderBase(const std::vector& shapes) : shapes_(shapes) { + PADDLE_ENFORCE(!shapes_.empty()); + } + // Read the next batch of data. (A 'batch' can be only one instance) + virtual void ReadNext(std::vector* out) = 0; + // Show whether the next bacth exists. + virtual bool HasNext() const = 0; + + // Reinitialize the reader and read the file from the begin. + virtual void ReInit() = 0; + + // Get a certain read in data's shape. + DDim shape(size_t idx) const; + // Get shapes of all read in data. + std::vector shapes() const { return shapes_; } + // Set shapes of read in data. 
+ void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; } + + virtual ~ReaderBase() {} + + protected: + std::vector<DDim> shapes_; +}; +``` + +### `FileReader` and `DecoratedReader` + +These two classes are derived from the `ReaderBase` and will further be derived by respective specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yields only one instance of data at a time, e.g. RecordIO reader, jpg reader, .... A decorated reader takes another reader (both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them (shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers. + +All the readers share exactly the same interfaces defined in `ReaderBase`. So they can be decorated more than once: We can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops to use readers without knowing what they are exactly. + + +### `ReaderHolder` + +Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code: + +```cpp +var->Get<ReaderBase>("batch_reader"); +``` + +we have to write: + +```cpp +var->Get<BatchReader>("batch_reader"); +``` + +This requires that we know the reader's exact type every time we get a reader from a variable. It is nearly impossible. + +To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader. 
+ +## Related Operators + +To create and invoke readers, some new ops are introduced: + +### `CreateReaderOp` + +Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as their output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers. + +### `ReadOp` + +A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables. diff --git a/doc/design/csp.md b/doc/design/csp.md index ba9cacfdea7dcf7c6499b562dfc58400d082f2c8..10d936860fab7e09241e968a63526c7d86d3e568 100644 --- a/doc/design/csp.md +++ b/doc/design/csp.md @@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue. In Go, its implemented i The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll. -It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax. +It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax. ### Type Channel @@ -71,14 +71,14 @@ ch1 := make(chan int, 100) // a channel that can buffer 100 ints. 
In Fluid, we should be able to do the same: ```python -ch = fluid.make_chan(dtype=INT) -ch1 = fluid.make_chan(dtype=INT, 100) +ch = fluid.make_channel(dtype=INT) +ch1 = fluid.make_channel(dtype=INT, 100) ``` In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16: ```python -ch = fluid.make_chan(dtype=Tensor, etype=float16) +ch = fluid.make_channel(dtype=Tensor, etype=float16) ``` or Tensors of Tensors of float16 etc. @@ -87,8 +87,136 @@ The point here is that we need a consistent way to compose types, like in C++ we ### Send and Recv +Go's CSP implementation depends on data type *channel*. There are two types of channels: + +1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty. +1. blocked channel, or unbuffered channel, is a blocking queue with no buffer. Both sending and receiving block with unbuffered channels. + +There are four types of actions with a channel: + +1. Create a channel + + ```go + ch := make(chan int) // this is an unbuffered channel + ch := make(chan int, 100) // this is a buffered channel of 100 ints. + ``` + +1. Send + + ```go + ch <- 111 + ``` + +1. Recv + + ```go + y, ok <- ch + ``` + +1. Close + + ```go + close(ch) + ``` + + Please be aware that a closed channel is not a nil channel, which is `var ch chan int`. + +There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms): + +1. A send to a nil channel blocks forever + +1. A receive from a nil channel blocks forever + +1. A send to a closed channel panics + +1. A receive from a closed channel returns the residual values and then zeros. 
+ +In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h) + +The following program illustrates the Python syntax for accessing Fluid buffers. + +```python +import fluid + +buffer_size = 10 +ch = fluid.make_channel(dtype=INT, buffer_size) + +# Now write buffer_size elements to the channel +with fluid.while(steps=buffer_size): + fluid.send(ch, step) + +fluid.close_channel(ch) + +with fluid.while(steps=buffer_size): + fluid.print(fluid.recv(ch)) +``` + +The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines. + +```python +import fluid + +ch = fluid.make_channel(dtype=INT) + +with fluid.go(): + fluid.send(ch) + +y = fluid.recv(ch) + +fluid.close_channel(ch) +``` + ### Select +In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready. + +```go + +ch1 := make(chan int) +ch2 := make(chan int, 100) + +x := 0 + +for { + select { + case ch1 <- x: + x = x + 1 + case y := <-ch2: + fmt.Println("Received on channel", y) + default: + fmt.Println("Default") + } + } + +``` + +In Fluid, we should be able to do the same: + +```python +ch1 = fluid.make_channel(dtype=INT) +ch2 = fluid.make_channel(dtype=INT, 100) + +sel = fluid.select() + +with sel.case(ch1, 'w', X): + fluid.layers.increment(X) + +with sel.case(ch2, 'r', Y): + fluid.print("Received on Channel") + +with sel.default(): + fluid.print("Default") + +``` + +In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one. + +- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. 
The character `w` is used here to make the syntax familiar to write syntax in Python I/O. + +- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O. + +- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed. + ## Example Programs ### 1. RPC between Trainers and Parameter Servers diff --git a/doc/design/switch.md b/doc/design/switch.md index 9db1b2782a521c2ff4b28b8f9efcdf1492242ed4..827d0601c621e4a230de28e2baad8e196e69625e 100644 --- a/doc/design/switch.md +++ b/doc/design/switch.md @@ -10,8 +10,7 @@ The following example shows the usage of `fluid.switch`. a = fluid.Var(10) b = fluid.Var(0) -switch = fluid.switch() -with switch.block(): +with switch() as switch: with switch.case(fluid.less_equal(a, 10)): fluid.print("Case 1") with switch.case(fluid.larger(a, 0)): diff --git a/doc/howto/dev/FullyConnected.jpg b/doc/dev/FullyConnected.jpg similarity index 100% rename from doc/howto/dev/FullyConnected.jpg rename to doc/dev/FullyConnected.jpg diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/dev/contribute_to_paddle_cn.md similarity index 100% rename from doc/howto/dev/contribute_to_paddle_cn.md rename to doc/dev/contribute_to_paddle_cn.md diff --git a/doc/dev/contribute_to_paddle_en.md b/doc/dev/contribute_to_paddle_en.md new file mode 120000 index 0000000000000000000000000000000000000000..f939e75f21a8badb5c40f527abd0e098fe9bc472 --- /dev/null +++ b/doc/dev/contribute_to_paddle_en.md @@ -0,0 +1 @@ +../../CONTRIBUTING.md \ No newline at end of file diff --git a/doc/dev/index_cn.rst b/doc/dev/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..487db868bb2a0a5383d56c3a723912d9fd5910b7 --- /dev/null +++ b/doc/dev/index_cn.rst @@ -0,0 +1,8 @@ +开发标准 +======== + +.. 
toctree:: + :maxdepth: 1 + + contribute_to_paddle_cn.md + write_docs_cn.rst diff --git a/doc/dev/index_en.rst b/doc/dev/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..5dd12d2233cff20e021b90beb94571a2817bd1ad --- /dev/null +++ b/doc/dev/index_en.rst @@ -0,0 +1,9 @@ +Development +------------ + +.. toctree:: + :maxdepth: 1 + + new_layer_en.rst + contribute_to_paddle_en.md + write_docs_en.rst diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/dev/new_layer_cn.rst similarity index 100% rename from doc/howto/dev/new_layer_cn.rst rename to doc/dev/new_layer_cn.rst diff --git a/doc/howto/dev/new_layer_en.rst b/doc/dev/new_layer_en.rst similarity index 100% rename from doc/howto/dev/new_layer_en.rst rename to doc/dev/new_layer_en.rst diff --git a/doc/howto/dev/new_op_cn.md b/doc/dev/new_op_cn.md similarity index 100% rename from doc/howto/dev/new_op_cn.md rename to doc/dev/new_op_cn.md diff --git a/doc/howto/dev/new_op_en.md b/doc/dev/new_op_en.md similarity index 100% rename from doc/howto/dev/new_op_en.md rename to doc/dev/new_op_en.md diff --git a/doc/howto/dev/new_op_kernel_en.md b/doc/dev/new_op_kernel_en.md similarity index 100% rename from doc/howto/dev/new_op_kernel_en.md rename to doc/dev/new_op_kernel_en.md diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/dev/use_eigen_cn.md similarity index 100% rename from doc/howto/dev/use_eigen_cn.md rename to doc/dev/use_eigen_cn.md diff --git a/doc/howto/dev/use_eigen_en.md b/doc/dev/use_eigen_en.md similarity index 100% rename from doc/howto/dev/use_eigen_en.md rename to doc/dev/use_eigen_en.md diff --git a/doc/dev/write_docs_cn.rst b/doc/dev/write_docs_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..f79769b810b91c6984016d95f40b89186bfb61b0 --- /dev/null +++ b/doc/dev/write_docs_cn.rst @@ -0,0 +1,111 @@ +############# +如何贡献文档 +############# + +PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 
``doc_cn`` 两个子目录下。 +也可以利用PaddlePaddle 工具来编译文档,这个情况下所有的文件会存在整理过的文件目录 .ppo_workspace/content 下 + +如何构建文档 +============ + +PaddlePaddle的文档构建有三种方式。 + + +使用PaddlePaddle.org工具 +------------------------ +这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。 + +文件工具是使用Docker,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后即可用以下命令启动工具 + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令 +之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 + +如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. 
In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 +之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 + +想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 + +使用Docker构建 +-------------- + +使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 + +.. code-block:: bash + + cd TO_YOUR_PADDLE_CLONE_PATH + cd paddle/scripts/tools/build_docs + sh build_docs.sh + +编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 +打开浏览器访问对应目录下的index.html即可访问本地文档。 + +直接构建 +-------- + +如果提示正确,可以执行以下命令编译生成文档,即 + +.. code-block:: bash + + cd TO_YOUR_PADDLE_CLONE_PATH + mkdir -p build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON + make gen_proto_py + make paddle_docs paddle_docs_cn + +编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 +打开浏览器访问对应目录下的index.html即可访问本地文档。 + + +如何书写文档 +============ + +PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 + +如何更新www.paddlepaddle.org +============================ + +更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 +目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 +`英文文档 `_ 。 + + +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/dev/write_docs_en.rst b/doc/dev/write_docs_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..f3408a84269aaeef19986c220454555fbbe30e23 --- /dev/null +++ b/doc/dev/write_docs_en.rst @@ -0,0 +1,80 @@ +######################## +Contribute Documentation +######################## + +PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. +Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories. 
+When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content + +How to Build Documentations +=========================== + +We recommend using PaddlePaddle.org tool to build documentation + + +Use PaddlePaddle.org tool +------------------------- +This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser. + +The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories. You may only clone the contents you need + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command +Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + + +If you don't wish to use Docker, you can also activate the tool through Django. Use the following commands to set it up + +.. 
code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + +If you want to learn more on the PaddlePaddle.org, please `click here `_ . + +How to write Documentations +=========================== + +PaddlePaddle uses `sphinx`_ to compile documentation. Please check the sphinx official website for more details. + + +How to update www.paddlepaddle.org +================================== + +Please create PRs and submit them to github, please check `Contribute Code `_ . +PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs `_ and +`English Docs `_ . + +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst deleted file mode 100644 index c9ba84c842b530162c92713046e64fdf82bd441b..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/index_cn.rst +++ /dev/null @@ -1,33 +0,0 @@ -安装与编译 -========== - -.. _install_steps: - -安装流程 -++++++++ - -PaddlePaddle提供pip和Docker的安装方式: - -.. toctree:: - :maxdepth: 1 - - pip_install_cn.rst - docker_install_cn.rst - ../../howto/dev/build_cn.md - -编译流程 -++++++++ - -.. 
warning:: - - 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。 - -.. toctree:: - :maxdepth: 1 - - build_from_source_cn.rst - -常见问题解答 -++++++++++ - -`常见问题解答 `_ diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst deleted file mode 100644 index 32d66d63dd5b2a30d5de4a088dc80b680830cb84..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/index_en.rst +++ /dev/null @@ -1,34 +0,0 @@ -Install and Build -================= - -.. _install_steps: - -Install Steps -++++++++ - -You can choose either pip or Docker to complete your install: - -.. toctree:: - :maxdepth: 1 - - pip_install_en.rst - docker_install_en.rst - ../../howto/dev/build_en.md - - -Build from Source ------------------ - -.. warning:: - - We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary. - -.. toctree:: - :maxdepth: 1 - - build_from_source_en.md - -FAQ -++++++++++ - -`FAQ `_ diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst index e695ff283e2e806377a51c559b37e8068360a4ff..608f49f5a969b3291eb43bf2acf582af74e566a1 100644 --- a/doc/getstarted/concepts/use_concepts_cn.rst +++ b/doc/getstarted/concepts/use_concepts_cn.rst @@ -4,7 +4,7 @@ PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API,可以轻松地完成神经网络配置,模型训练等任务。 这里将介绍PaddlePaddle的基本使用概念,并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。 -在使用该文档之前,请参考 `安装文档 <../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。 +在使用该文档之前,请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。 配置网络 diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index 9f6ee25987d51dcca3a37cf0f62a70a5a5a2d89a..1dc141396b95bda776aeff87ac30fad6baf37bd2 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -1,61 +1,8 @@ 新手入门 ============ -.. 
_quick_install: - -快速安装 -++++++++ - -PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 -执行下面的命令完成快速安装,版本为cpu_avx_openblas: - - .. code-block:: bash - - pip install paddlepaddle - -如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: - - .. code-block:: bash - - pip install paddlepaddle-gpu - -更详细的安装和编译方法参考: - -.. toctree:: - :maxdepth: 1 - - build_and_install/index_cn.rst - -.. _quick_start: - -快速开始 -++++++++ - -创建一个 housing.py 并粘贴此Python代码: - - .. code-block:: python - - import paddle.v2 as paddle - - # Initialize PaddlePaddle. - paddle.init(use_gpu=False, trainer_count=1) - - # Configure the neural network. - x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) - y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) - - # Infer using provided test data. - probs = paddle.infer( - output_layer=y_predict, - parameters=paddle.dataset.uci_housing.model(), - input=[item for item in paddle.dataset.uci_housing.test()()]) - - for i in xrange(len(probs)): - print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) - -执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 - .. toctree:: :maxdepth: 1 + quickstart_cn.rst concepts/use_concepts_cn.rst diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index 063d9d880c82550f7f5d47d3d0b1fff59865bca7..c680e1903750117073bee64cb4d4f4ccfff5ac3d 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -1,61 +1,7 @@ GET STARTED ============ -.. _quick_install: - -Quick Install ----------------------- - -You can use pip to install PaddlePaddle with a single command, supports -CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. -Simply run the following command to install, the version is cpu_avx_openblas: - - .. code-block:: bash - - pip install paddlepaddle - -If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: - - .. 
code-block:: bash - - pip install paddlepaddle-gpu - -For more details about installation and build: - .. toctree:: :maxdepth: 1 - build_and_install/index_en.rst - - -.. _quick_start: - -Quick Start -++++++++ - -Create a new file called housing.py, and paste this Python -code: - - - .. code-block:: python - - import paddle.v2 as paddle - - # Initialize PaddlePaddle. - paddle.init(use_gpu=False, trainer_count=1) - - # Configure the neural network. - x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) - y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) - - # Infer using provided test data. - probs = paddle.infer( - output_layer=y_predict, - parameters=paddle.dataset.uci_housing.model(), - input=[item for item in paddle.dataset.uci_housing.test()()]) - - for i in xrange(len(probs)): - print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) - -Run :code:`python housing.py` and voila! It should print out a list of predictions -for the test housing data. + quickstart_en.rst diff --git a/doc/getstarted/quickstart_cn.rst b/doc/getstarted/quickstart_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..d511cead262dabafd095f68adb5ffc596a7fe596 --- /dev/null +++ b/doc/getstarted/quickstart_cn.rst @@ -0,0 +1,47 @@ +快速开始 +======== + +快速安装 +-------- + +PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 +执行下面的命令完成快速安装,版本为cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +更详细的安装和编译方法参考::ref:`install_steps` 。 + +快速使用 +-------- + +创建一个 housing.py 并粘贴此Python代码: + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. 
+ x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 diff --git a/doc/getstarted/quickstart_en.rst b/doc/getstarted/quickstart_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..70f7fe0646068aa79cd72955c6848ac0250c2300 --- /dev/null +++ b/doc/getstarted/quickstart_en.rst @@ -0,0 +1,51 @@ +Quick Start +============ + +Quick Install +------------- + +You can use pip to install PaddlePaddle with a single command, supports +CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. +Simply run the following command to install, the version is cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +For more details about installation and build: :ref:`install_steps` . + +Quick Use +--------- + +Create a new file called housing.py, and paste this Python +code: + + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. 
+ probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +Run :code:`python housing.py` and voila! It should print out a list of predictions +for the test housing data. diff --git a/doc/howto/capi/compile_paddle_lib_cn.md b/doc/howto/capi/compile_paddle_lib_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..fd8dec8164580b9dcb716e69f3cc5357639f17d3 --- /dev/null +++ b/doc/howto/capi/compile_paddle_lib_cn.md @@ -0,0 +1,122 @@ +## 安装与编译C-API预测库 + +### 概述 + +使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时需配制下面这些编译选项: + +必须配置选项: +- `WITH_C_API`,必须配置为`ON`。 + +推荐配置选项: +- `WITH_PYTHON`,推荐配置为`OFF` +- `WITH_SWIG_PY`,推荐配置为`OFF` +- `WITH_GOLANG`,推荐设置为`OFF` + +可选配置选项: +- `WITH_GPU`,可配置为`ON/OFF` +- `WITH_MKL`,可配置为`ON/OFF` + +对推荐配置中的选项建议按照设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 + +下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): + +```shell +PADDLE_ROOT=/path/of/capi +git clone https://github.com/PaddlePaddle/Paddle.git +cd Paddle +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_GOLANG=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. 
+``` + +执行上述代码生成Makefile文件后,执行:`make && make install`。成功编译后,使用C-API所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件)均会存放于`PADDLE_ROOT`目录中。 + +编译成功后在 `PADDLE_ROOT` 下会看到如下目录结构(包括了编译出的PaddlePaddle头文件和链接库,以及第三方依赖链接库和头文件(如果需要,由链接方式决定)): + +```text +├── include +│   └── paddle +│   ├── arguments.h +│   ├── capi.h +│   ├── capi_private.h +│   ├── config.h +│   ├── error.h +│   ├── gradient_machine.h +│   ├── main.h +│   ├── matrix.h +│   ├── paddle_capi.map +│   └── vector.h +├── lib +│   ├── libpaddle_capi_engine.a +│   ├── libpaddle_capi_layers.a +│   ├── libpaddle_capi_shared.so +│   └── libpaddle_capi_whole.a +└── third_party + ├── gflags + │   ├── include + │   │   └── gflags + │   │   ├── gflags_completions.h + │   │   ├── gflags_declare.h + │   │   ... + │   └── lib + │   └── libgflags.a + ├── glog + │   ├── include + │   │   └── glog + │   │   ├── config.h + │   │   ... + │   └── lib + │   └── libglog.a + ├── openblas + │   ├── include + │   │   ├── cblas.h + │   │   ... + │   └── lib + │   ... + ├── protobuf + │   ├── include + │   │   └── google + │   │   └── protobuf + │   │   ... + │   └── lib + │   └── libprotobuf-lite.a + └── zlib + ├── include + │   ... + └── lib + ... + +``` + +### 链接说明 + +目前提供三种链接方式: + +1. 链接`libpaddle_capi_shared.so` 动态库 + - 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意: + 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 + 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 + 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 + - 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**。 + +2. 链接静态库 `libpaddle_capi_whole.a` + - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: + 1. 需要指定`-Wl,--whole-archive`链接选项。 + 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 + 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 + 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 + +3. 
链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a` + - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: + 1. 这种链接方式主要用于移动端预测。 + 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 + 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 + 1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。 diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/capi/images/csr.png similarity index 100% rename from doc/howto/usage/capi/images/csr.png rename to doc/howto/capi/images/csr.png diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/capi/images/sequence_data.png similarity index 100% rename from doc/howto/usage/capi/images/sequence_data.png rename to doc/howto/capi/images/sequence_data.png diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/capi/images/workflow_of_CAPI.png similarity index 100% rename from doc/howto/usage/capi/images/workflow_of_CAPI.png rename to doc/howto/capi/images/workflow_of_CAPI.png diff --git a/doc/howto/capi/index_cn.rst b/doc/howto/capi/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..e589a6d346a1e23a4eed9801e02727c80782ae8b --- /dev/null +++ b/doc/howto/capi/index_cn.rst @@ -0,0 +1,9 @@ +C-API预测库 +================== + +.. 
toctree:: + :maxdepth: 1 + + compile_paddle_lib_cn.md + organization_of_the_inputs_cn.md + workflow_of_capi_cn.md diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/capi/organization_of_the_inputs_cn.md similarity index 100% rename from doc/howto/usage/capi/organization_of_the_inputs_cn.md rename to doc/howto/capi/organization_of_the_inputs_cn.md diff --git a/doc/howto/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..a61d2267bfdb7c32da528735b20d7c6a531aaa1f --- /dev/null +++ b/doc/howto/capi/workflow_of_capi_cn.md @@ -0,0 +1,119 @@ +## C-API使用流程 + +这篇文档介绍 PaddlePaddle C-API 整体使用流程。 + +### 使用流程 + +使用 C-API 的工作流程如图1所示,分为(1)准备预测模型和(2)预测程序开发两大部分。 + +

+
图1. C-API使用流程示意图 +

+ +- 准备预测模型 + 1. 只将神经网络结构进行序列化。 + - 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。 + 1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。 + - 神经网络模型结构和训练好的模型将被序列化合并入一个文件。 + - 预测时只需加载一个文件便于发布。 + - **注意**:以上两种方式只需选择其一即可。 +- 调用 C-API 开发预测程序 + 1. 初始化PaddlePaddle运行环境。 + 1. 加载预测模型。 + 1. 创建神经网络输入,组织输入数据。 + 1. 进行前向计算,获得计算结果。 + 1. 清理和结束。 + +### 准备预测模型 + +准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。 + +调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 + +下面,我们将训练结束后存储下来的模型转换成预测模型。 + +1. 
序列化神经网络模型配置 + + PaddlePaddle 使用 protobuf 来传输网络配置文件中定义的网络结构和相关参数,使用 C-API 进行预测时,需要将网络结构使用 protobuf 进行序列化,写入文件中。 + + 调用[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)中的`dump_v2_config`函数能够将使用 PaddlePaddle V2 API 定义的神经网络结构 dump 到指定文件中,示例代码如下: + + ```python + from paddle.utils.dump_v2_config import dump_v2_config + from mnist_v2 import network + + predict = network(is_infer=True) + dump_v2_config(predict, "trainer_config.bin", True) + ``` + + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 + + 使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。 + +2. 合并模型文件(可选) + + 一些情况为了便于发布,希望能够将序列化后的神经网络结构和训练好的模型参数打包进一个文件。对于这样的需求,可以使用`paddle.utils.merge_model`中的`merge_v2_model`接口对神经网络结构和训练好的参数进行序列化,将序列化结果写入一个文件内。 + + 代码示例如下: + + ```python + from paddle.utils.merge_model import merge_v2_model + from mnist_v2 import network + + net = network(is_infer=True) + param_file = "models/params_pass_4.tar" + output_file = "output.paddle.model" + merge_v2_model(net, param_file, output_file) + ``` + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 + +#### 注意事项 +1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。 +1. **预测使用的网络结构往往不同于训练**,通常需要去掉网络中的:(1)类别标签层;(2)损失函数层;(3)`evaluator`等,只留下核心计算层,请注意是否需要修改网络结构。 +1. 
预测时,可以获取网络中定义的任意多个(大于等于一个)层前向计算的结果,需要哪些层的计算结果作为输出,就将这些层加入一个Python list中,作为调用`dump_v2_config`的第一个参数。 + +### 编写预测代码 + +预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 + +#### step 1. 初始化PaddlePaddle运行环境 +第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 + +#### step2. 加载模型 + +这里介绍C-API使用中的一个重要概念:Gradient Machine。 + +概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式: + +1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; +1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。 + +- 注意事项 + 1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。 + 1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。 + 1. 通过灵活使用以上两个接口,加载模型可其它多种方式,例如也可在程序运行过程中再加载另外一个模型。 + +#### step 3. 创建神经网络输入,组织输入数据 + +基本使用概念: +- 在PaddlePaddle内部,神经网络中一个计算层的输入输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 +- `Argument` 并不真正“存储”数据,而是将输入/输出数据有机地组织在一起。 +- 在`Argument`内部由:1. `Matrix`(二维矩阵,存储浮点类型输入/输出);2. 
`IVector`(一维数组,**仅用于存储整型值**,多用于自然语言处理任务)来实际存储数据。 + +C-API支持的所有输入数据类型和他们的组织方式,请参考“输入/输出数据组织”一节。 + +这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。 + +在组织神经网络输入,获取输出时,需要思考完成以下工作: +1. 为每一个输入/输出创建`argument`; +1. 为每一个`argument`创建`paddle_matrix`来存储数据; + +与输入不同的是,不需在使用C-API时为输出`argument`的`paddle_matrix`对象分配空间。前向计算之后PaddlePaddle内部已经分配/管理了每个计算层输出的存储空间。 + +#### step 4. 前向计算 + +完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。 + +#### step 5. 清理 + +结束预测之后,对使用的中间变量和资源进行清理和释放。 diff --git a/doc/howto/cluster/cmd_argument_cn.md b/doc/howto/cluster/cmd_argument_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..5c575dd5b53f6e4ea025a8fbaebdb2d1a1f1c9ed --- /dev/null +++ b/doc/howto/cluster/cmd_argument_cn.md @@ -0,0 +1,135 @@ +## 启动参数说明 + +下面以`doc/howto/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。 + +### 启动参数服务器 +执行以下的命令启动一个参数服务器并等待和计算节点的数据交互 +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 +``` + +如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行: +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log +``` + +参数说明 + +- port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信 +- ports_num:**必选,默认1**,监听的端口个数 +- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数 +- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 + +### 启动计算节点 +执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py) +```bash +$ python train.py +``` + +trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 + +使用环境变量: + +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 
+export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +``` + +使用参数: + +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") +``` + +参数说明 + +- use_gpu: **可选,默认False**,是否启用GPU训练 +- trainer_count:**必选,默认1**,当前trainer的线程数目 +- port:**必选,默认7164**,连接到pserver的端口 +- ports_num:**必选,默认1**,连接到pserver的端口个数 +- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 +- num_gradient_servers:**必选,默认1**,当前训练任务trainer总数 +- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 +- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 + + +### 准备数据集 + +参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。 + +在线上系统中,通常会使用MapReduce任务的输出结果作为训练结果,这样训练文件的个数会比较多,而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件: + +```python +import os +train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) +``` + +示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000`、`-00001`和`-00002`): +``` +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` + +在进行分布式训练时,每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中,系统会提供一个分布式存储服务,这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储,则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。 + +对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。 + +### 准备训练程序 + +我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。 + +最后,工作空间应如下所示: +``` +. 
+|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 +``` + +- `my_lib.py`:会被`train.py`调用的一些用户定义的库函数,比如PIL库等。 +- `word_dict.pickle`:在`train.py`中会使用到的字典数据文件。 +- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置: + + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + ``` + +- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。 +- `test_data_dir`:包含测试数据集的目录。 diff --git a/doc/howto/cluster/cmd_argument_en.md b/doc/howto/cluster/cmd_argument_en.md new file mode 100644 index 0000000000000000000000000000000000000000..06fd5717564c99e3bb46835a2bd5071dff665f23 --- /dev/null +++ b/doc/howto/cluster/cmd_argument_en.md @@ -0,0 +1,140 @@ +## Command-line arguments + +We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API. + +### Starting parameter server + +Type the below command to start a parameter server which will wait for trainers to connect: + +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 +``` + +If you wish to run parameter servers in background, and save a log file, you can type: +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log +``` + +Parameter Description + +- port: **required, default 7164**, port which parameter server will listen on. 
If ports_num is greater than 1, parameter server will listen on multiple ports for more network throughput. +- ports_num: **required, default 1**, total number of ports the parameter server will listen on. +- ports_num_for_sparse: **required, default 0**, number of ports which serve sparse parameter update. +- num_gradient_servers: **required, default 1**, total number of gradient servers. + +### Starting trainer +Type the command below to start the trainer(name the file whatever you want, like "train.py") + +```bash +$ python train.py +``` + +Trainers' network needs to be connected with parameter servers' network to finish the job. Trainers need to know port and IPs to locate parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass to `paddle.init()` function. Arguments passed to the `paddle.init()` function will override environment variables. + +Use environment variables: + +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 +export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +python train.py +``` + +Pass arguments: + +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") +``` + +Parameter Description + +- use_gpu: **optional, default False**, set to "True" to enable GPU training. +- trainer_count: **required, default 1**, number of threads in current trainer. +- port: **required, default 7164**, port to connect to parameter server. +- ports_num: **required, default 1**, number of ports for communication. +- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation. +- num_gradient_servers: **required, default 1**, number of trainers in current job. 
+- trainer_id: **required, default 0**, ID for every trainer, start from 0. +- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". + +### Prepare Training Dataset + +Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the count of output files. + +In the real world, we often use `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training file to trainers: + +```python +import os +train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) +``` + +Example code `prepare.py` will split training data and testing data into 3 files with numeric suffixes like `-00000`, `-00001` and `-00002`: + +``` +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` + +When the job starts, every trainer needs to get its own part of data. In some distributed systems a storage service will be provided, so the data under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node. + +Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job. + +### Prepare Training program + +We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory. + + +Your workspace may look like: +``` +. 
+|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 +``` + +- `my_lib.py`: user defined libraries, like PIL libs. This is optional. +- `word_dict.pickle`: dict file for training word embedding. +- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrieve configuration environment variables: + + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + ``` + +- `train_data_dir`: containing training data. Mount from storage service or copy training data to here. +- `test_data_dir`: containing testing data. diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/cluster/fluid_cluster_train_en.md similarity index 100% rename from doc/howto/usage/cluster/fluid_cluster_train_en.md rename to doc/howto/cluster/fluid_cluster_train_en.md diff --git a/doc/howto/cluster/index_cn.rst b/doc/howto/cluster/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..a60521b4a9646bdc6d9f1bf6da482acc989d8bf3 --- /dev/null +++ b/doc/howto/cluster/index_cn.rst @@ -0,0 +1,22 @@ +分布式训练 +========== + +本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示: + +.. 
image:: src/ps_cn.png + :width: 500 + +- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。 +- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。 +- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。 + +这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。 + +在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。 + +.. toctree:: + :maxdepth: 1 + + preparations_cn.md + cmd_argument_cn.md + multi_cluster/index_cn.rst diff --git a/doc/howto/cluster/index_en.rst b/doc/howto/cluster/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..2640a09dcc904619bc97c9bd3f3d81a9dc307663 --- /dev/null +++ b/doc/howto/cluster/index_en.rst @@ -0,0 +1,22 @@ +Distributed Training +==================== + +In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job: + +.. image:: src/ps_en.png + :width: 500 + +- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job. +- Trainer: each trainer reads the data shard, and trains the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer downloads optimized parameters and continues its training. +- Parameter server: every parameter server stores part of the whole neural network model data. 
They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers. + +PaddlePaddle can support both synchronous stochastic gradient descent (SGD) and asynchronous SGD. + +When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step. This will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noise in the gradient. + +.. toctree:: + :maxdepth: 1 + + preparations_en.md + cmd_argument_en.md + multi_cluster/index_en.rst diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/cluster/multi_cluster/fabric_cn.md similarity index 100% rename from doc/howto/usage/cluster/fabric_cn.md rename to doc/howto/cluster/multi_cluster/fabric_cn.md diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/cluster/multi_cluster/fabric_en.md similarity index 100% rename from doc/howto/usage/cluster/fabric_en.md rename to doc/howto/cluster/multi_cluster/fabric_en.md diff --git a/doc/howto/cluster/multi_cluster/index_cn.rst b/doc/howto/cluster/multi_cluster/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..ef56b6ddb38e59f20f7248de1ceb952c7627ce76 --- /dev/null +++ b/doc/howto/cluster/multi_cluster/index_cn.rst @@ -0,0 +1,20 @@ +在不同集群中运行 +================ + +PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括: +- `Kubernetes `_ Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。 +- `OpenMPI `_ 成熟的高性能并行计算框架。 +- `Fabric `_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。 + +对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 `_ 找到。 + 
+在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。 + +.. toctree:: + :maxdepth: 1 + + fabric_cn.md + openmpi_cn.md + k8s_cn.md + k8s_distributed_cn.md + k8s_aws_cn.md diff --git a/doc/howto/cluster/multi_cluster/index_en.rst b/doc/howto/cluster/multi_cluster/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..dac7aaef085c80851c1bbb89250faf2151de4ca6 --- /dev/null +++ b/doc/howto/cluster/multi_cluster/index_en.rst @@ -0,0 +1,19 @@ +Use different clusters +====================== + +PaddlePaddle supports running jobs on several platforms including: +- `Kubernetes `_ open-source system for automating deployment, scaling, and management of containerized applications from Google. +- `OpenMPI `_ Mature high performance parallel computing framework. +- `Fabric `_ A cluster management tool. Write scripts to submit jobs or manage the cluster. + +We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 `_ . + +These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc. + +.. 
toctree:: + :maxdepth: 1 + + fabric_en.md + openmpi_en.md + k8s_en.md + k8s_aws_en.md diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/cluster/multi_cluster/k8s_aws_cn.md similarity index 100% rename from doc/howto/usage/cluster/k8s_aws_cn.md rename to doc/howto/cluster/multi_cluster/k8s_aws_cn.md diff --git a/doc/howto/usage/cluster/k8s_aws_en.md b/doc/howto/cluster/multi_cluster/k8s_aws_en.md similarity index 100% rename from doc/howto/usage/cluster/k8s_aws_en.md rename to doc/howto/cluster/multi_cluster/k8s_aws_en.md diff --git a/doc/howto/usage/cluster/k8s_cn.md b/doc/howto/cluster/multi_cluster/k8s_cn.md similarity index 100% rename from doc/howto/usage/cluster/k8s_cn.md rename to doc/howto/cluster/multi_cluster/k8s_cn.md diff --git a/doc/howto/usage/cluster/k8s_distributed_cn.md b/doc/howto/cluster/multi_cluster/k8s_distributed_cn.md similarity index 100% rename from doc/howto/usage/cluster/k8s_distributed_cn.md rename to doc/howto/cluster/multi_cluster/k8s_distributed_cn.md diff --git a/doc/howto/usage/cluster/k8s_en.md b/doc/howto/cluster/multi_cluster/k8s_en.md similarity index 100% rename from doc/howto/usage/cluster/k8s_en.md rename to doc/howto/cluster/multi_cluster/k8s_en.md diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/cluster/multi_cluster/openmpi_cn.md similarity index 100% rename from doc/howto/usage/cluster/openmpi_cn.md rename to doc/howto/cluster/multi_cluster/openmpi_cn.md diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/cluster/multi_cluster/openmpi_en.md similarity index 100% rename from doc/howto/usage/cluster/openmpi_en.md rename to doc/howto/cluster/multi_cluster/openmpi_en.md diff --git a/doc/howto/usage/cluster/src/add_security_group.png b/doc/howto/cluster/multi_cluster/src/add_security_group.png similarity index 100% rename from doc/howto/usage/cluster/src/add_security_group.png rename to doc/howto/cluster/multi_cluster/src/add_security_group.png diff --git 
a/doc/howto/usage/cluster/src/create_efs.png b/doc/howto/cluster/multi_cluster/src/create_efs.png similarity index 100% rename from doc/howto/usage/cluster/src/create_efs.png rename to doc/howto/cluster/multi_cluster/src/create_efs.png diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png similarity index 100% rename from doc/howto/usage/cluster/src/k8s-paddle-arch.png rename to doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png diff --git a/doc/howto/usage/cluster/src/k8s_data/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile similarity index 100% rename from doc/howto/usage/cluster/src/k8s_data/Dockerfile rename to doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile diff --git a/doc/howto/usage/cluster/src/k8s_data/README.md b/doc/howto/cluster/multi_cluster/src/k8s_data/README.md similarity index 100% rename from doc/howto/usage/cluster/src/k8s_data/README.md rename to doc/howto/cluster/multi_cluster/src/k8s_data/README.md diff --git a/doc/howto/usage/cluster/src/k8s_data/get_data.sh b/doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh similarity index 100% rename from doc/howto/usage/cluster/src/k8s_data/get_data.sh rename to doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh diff --git a/doc/howto/usage/cluster/src/k8s_train/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile similarity index 100% rename from doc/howto/usage/cluster/src/k8s_train/Dockerfile rename to doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile diff --git a/doc/howto/usage/cluster/src/k8s_train/README.md b/doc/howto/cluster/multi_cluster/src/k8s_train/README.md similarity index 100% rename from doc/howto/usage/cluster/src/k8s_train/README.md rename to doc/howto/cluster/multi_cluster/src/k8s_train/README.md diff --git a/doc/howto/usage/cluster/src/k8s_train/start.sh b/doc/howto/cluster/multi_cluster/src/k8s_train/start.sh similarity index 100% rename from 
doc/howto/usage/cluster/src/k8s_train/start.sh rename to doc/howto/cluster/multi_cluster/src/k8s_train/start.sh diff --git a/doc/howto/usage/cluster/src/k8s_train/start_paddle.py b/doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py similarity index 100% rename from doc/howto/usage/cluster/src/k8s_train/start_paddle.py rename to doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py diff --git a/doc/howto/usage/cluster/src/pserver_and_trainer.png b/doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png similarity index 100% rename from doc/howto/usage/cluster/src/pserver_and_trainer.png rename to doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png diff --git a/doc/howto/usage/cluster/src/route53_create_recordset.png b/doc/howto/cluster/multi_cluster/src/route53_create_recordset.png similarity index 100% rename from doc/howto/usage/cluster/src/route53_create_recordset.png rename to doc/howto/cluster/multi_cluster/src/route53_create_recordset.png diff --git a/doc/howto/usage/cluster/src/route53_create_zone.png b/doc/howto/cluster/multi_cluster/src/route53_create_zone.png similarity index 100% rename from doc/howto/usage/cluster/src/route53_create_zone.png rename to doc/howto/cluster/multi_cluster/src/route53_create_zone.png diff --git a/doc/howto/usage/cluster/src/worker_security_group.png b/doc/howto/cluster/multi_cluster/src/worker_security_group.png similarity index 100% rename from doc/howto/usage/cluster/src/worker_security_group.png rename to doc/howto/cluster/multi_cluster/src/worker_security_group.png diff --git a/doc/howto/cluster/preparations_cn.md b/doc/howto/cluster/preparations_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..ce40697e703503b66f6306e15ebdb0ce1329991d --- /dev/null +++ b/doc/howto/cluster/preparations_cn.md @@ -0,0 +1,16 @@ +## 环境准备 + +1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被称为一个“节点”。 +1. 
我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。 + +安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`): +```bash +$ paddle version +PaddlePaddle 0.10.0, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF +``` diff --git a/doc/howto/cluster/preparations_en.md b/doc/howto/cluster/preparations_en.md new file mode 100644 index 0000000000000000000000000000000000000000..4b77b293907ae0548134fc65ceed3aa0ed0b845d --- /dev/null +++ b/doc/howto/cluster/preparations_en.md @@ -0,0 +1,17 @@ +## Preparations + +1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes". +2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html). 
+ +After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`): + +```bash +$ paddle version +PaddlePaddle 0.10.0rc, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF +``` diff --git a/doc/howto/usage/cluster/src/Dockerfile b/doc/howto/cluster/src/Dockerfile similarity index 100% rename from doc/howto/usage/cluster/src/Dockerfile rename to doc/howto/cluster/src/Dockerfile diff --git a/doc/howto/usage/cluster/src/efs_mount.png b/doc/howto/cluster/src/efs_mount.png similarity index 100% rename from doc/howto/usage/cluster/src/efs_mount.png rename to doc/howto/cluster/src/efs_mount.png diff --git a/doc/howto/usage/cluster/src/managed_policy.png b/doc/howto/cluster/src/managed_policy.png similarity index 100% rename from doc/howto/usage/cluster/src/managed_policy.png rename to doc/howto/cluster/src/managed_policy.png diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/cluster/src/ps_cn.png similarity index 100% rename from doc/howto/usage/cluster/src/trainer_cn.png rename to doc/howto/cluster/src/ps_cn.png diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/cluster/src/ps_en.png similarity index 100% rename from doc/howto/usage/cluster/src/trainer.png rename to doc/howto/cluster/src/ps_en.png diff --git a/doc/howto/cluster/src/trainer.png b/doc/howto/cluster/src/trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/howto/cluster/src/trainer.png differ diff --git a/doc/howto/cluster/src/trainer_cn.png b/doc/howto/cluster/src/trainer_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc Binary files /dev/null and b/doc/howto/cluster/src/trainer_cn.png differ diff --git 
a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/cluster/src/word2vec/api_train_v2.py similarity index 100% rename from doc/howto/usage/cluster/src/word2vec/api_train_v2.py rename to doc/howto/cluster/src/word2vec/api_train_v2.py diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py similarity index 100% rename from doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py rename to doc/howto/cluster/src/word2vec/api_train_v2_cluster.py diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/cluster/src/word2vec/prepare.py similarity index 100% rename from doc/howto/usage/cluster/src/word2vec/prepare.py rename to doc/howto/cluster/src/word2vec/prepare.py diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/cmd_parameter/arguments_cn.md similarity index 100% rename from doc/howto/usage/cmd_parameter/arguments_cn.md rename to doc/howto/cmd_parameter/arguments_cn.md diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/cmd_parameter/arguments_en.md similarity index 100% rename from doc/howto/usage/cmd_parameter/arguments_en.md rename to doc/howto/cmd_parameter/arguments_en.md diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/cmd_parameter/detail_introduction_cn.md similarity index 100% rename from doc/howto/usage/cmd_parameter/detail_introduction_cn.md rename to doc/howto/cmd_parameter/detail_introduction_cn.md diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/cmd_parameter/detail_introduction_en.md similarity index 100% rename from doc/howto/usage/cmd_parameter/detail_introduction_en.md rename to doc/howto/cmd_parameter/detail_introduction_en.md diff --git a/doc/howto/cmd_parameter/index_cn.rst b/doc/howto/cmd_parameter/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..17b379f6295d66d864e2b53108012eff5895d96b --- /dev/null +++ 
b/doc/howto/cmd_parameter/index_cn.rst @@ -0,0 +1,11 @@ +.. _cmd_line_index: + +命令行参数设置 +=============== + +.. toctree:: + :maxdepth: 1 + + use_case_cn.md + arguments_cn.md + detail_introduction_cn.md diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/cmd_parameter/index_en.rst similarity index 100% rename from doc/howto/usage/cmd_parameter/index_en.rst rename to doc/howto/cmd_parameter/index_en.rst diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/cmd_parameter/use_case_cn.md similarity index 100% rename from doc/howto/usage/cmd_parameter/use_case_cn.md rename to doc/howto/cmd_parameter/use_case_cn.md diff --git a/doc/howto/usage/cmd_parameter/use_case_en.md b/doc/howto/cmd_parameter/use_case_en.md similarity index 100% rename from doc/howto/usage/cmd_parameter/use_case_en.md rename to doc/howto/cmd_parameter/use_case_en.md diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst deleted file mode 100644 index 9ecab5594cff47cde4700b7ce0f58013a960a16e..0000000000000000000000000000000000000000 --- a/doc/howto/deep_model/rnn/index_cn.rst +++ /dev/null @@ -1,10 +0,0 @@ -RNN相关模型 -=========== - -.. 
toctree:: - :maxdepth: 1 - - rnn_config_cn.rst - recurrent_group_cn.md - hierarchical_layer_cn.rst - hrnn_rnn_api_compare_cn.rst diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md deleted file mode 120000 index c97564d93a7f0a753a23cd97d2467d595bd154ff..0000000000000000000000000000000000000000 --- a/doc/howto/dev/contribute_to_paddle_en.md +++ /dev/null @@ -1 +0,0 @@ -../../../CONTRIBUTING.md \ No newline at end of file diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst deleted file mode 100644 index 1bc947c260d7adb75ee5a2bb10e6b91bc0be2d4c..0000000000000000000000000000000000000000 --- a/doc/howto/dev/write_docs_cn.rst +++ /dev/null @@ -1,111 +0,0 @@ -################## -如何贡献/修改文档 -################## - -PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。 -也可以利用PaddlePaddle 工具来编译文档,这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下 - -如何构建文档 -============ - -PaddlePaddle的文档构建有三种方式。 - - -使用PaddlePaddle.org工具 --------------- -这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。 - -文件工具是使用Docker,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具 - -.. code-block:: bash - - mkdir paddlepaddle # Create paddlepaddle working directory - cd paddlepaddle - - # Clone the content repositories - git clone https://github.com/PaddlePaddle/Paddle.git - git clone https://github.com/PaddlePaddle/book.git - git clone https://github.com/PaddlePaddle/models.git - git clone https://github.com/PaddlePaddle/Mobile.git - - # Please specify the working directory through -v - docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest - -注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令 -之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 -编译后的文件将被存储在工作目录 /.ppo_workspace/content。 - -如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 - -.. 
code-block:: bash - - mkdir paddlepaddle # Create paddlepaddle working directory - cd paddlepaddle - - # Clone the content repositories and PaddlePaddle.org - git clone https://github.com/PaddlePaddle/Paddle.git - git clone https://github.com/PaddlePaddle/book.git - git clone https://github.com/PaddlePaddle/models.git - git clone https://github.com/PaddlePaddle/Mobile.git - git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git - - # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd - export CONTENT_DIR= - export ENV='' - cd PaddlePaddle.org/portal/ - pip install -r requirements.txt - python manage.py runserver - -工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 -之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。 -编译后的文件将被存储在工作目录 /.ppo_workspace/content。 - -想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 - -使用Docker构建 --------------- - -使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 - -.. code-block:: bash - - cd TO_YOUR_PADDLE_CLONE_PATH - cd paddle/scripts/tools/build_docs - sh build_docs.sh - -编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 -打开浏览器访问对应目录下的index.html即可访问本地文档。 - -直接构建 --------- - -如果提示正确,可以执行以下命令编译生成文档,即 - -.. code-block:: bash - - cd TO_YOUR_PADDLE_CLONE_PATH - mkdir -p build - cd build - cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON - make gen_proto_py - make paddle_docs paddle_docs_cn - -编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 -打开浏览器访问对应目录下的index.html即可访问本地文档。 - - -如何书写文档 -============ - -PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 - -如何更新www.paddlepaddle.org -============================ - -更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 -目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 -`英文文档 `_ 。 - - -.. _cmake: https://cmake.org/ -.. 
_sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst deleted file mode 100644 index b3ef07eb1d0012827df8e6a4f27c5fa643649492..0000000000000000000000000000000000000000 --- a/doc/howto/dev/write_docs_en.rst +++ /dev/null @@ -1,80 +0,0 @@ -################## -Contribute Documentation -################## - -PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. -Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories. -When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content - -How to Build Documentations -============ - -We recommend using PaddlePaddle.org tool to build documentation - - -Use PaddlePaddle.org tool --------------- -This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser. - -The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool - -.. code-block:: bash - - mkdir paddlepaddle # Create paddlepaddle working directory - cd paddlepaddle - - # Clone the content repositories. 
You may only clone the contents you need - git clone https://github.com/PaddlePaddle/Paddle.git - git clone https://github.com/PaddlePaddle/book.git - git clone https://github.com/PaddlePaddle/models.git - git clone https://github.com/PaddlePaddle/Mobile.git - - # Please specify the working directory through -v - docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest - -Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command -Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation -The compiled documentations will be stored in /.ppo_workspace/content - - -If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up - -.. code-block:: bash - - mkdir paddlepaddle # Create paddlepaddle working directory - cd paddlepaddle - - # Clone the content repositories and PaddlePaddle.org - git clone https://github.com/PaddlePaddle/Paddle.git - git clone https://github.com/PaddlePaddle/book.git - git clone https://github.com/PaddlePaddle/models.git - git clone https://github.com/PaddlePaddle/Mobile.git - git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git - - # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd - export CONTENT_DIR= - export ENV='' - cd PaddlePaddle.org/portal/ - pip install -r requirements.txt - python manage.py runserver - -Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation -The compiled documentations will be stored in /.ppo_workspace/content - -If you want to learn more on the PaddlePaddle.org, please `click here `_ 。 - -How to write Documentations -============ - -PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail. 
- - -How to update www.paddlepaddle.org -============================ - -Please create PRs and submit them to github, please check `Contribute Code `_ 。 -PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs `_ and -`English Docs `_ 。 - -.. _cmake: https://cmake.org/ -.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index e0c69f7a6a4043abe999af6c8dd2555178b68424..0c534f107b6e047035c424ed2ea59f3982799b63 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -1,37 +1,11 @@ -进阶指南 +进阶使用 ======== -使用说明 --------- - -.. toctree:: - :maxdepth: 1 - - usage/cmd_parameter/index_cn.rst - usage/cluster/cluster_train_cn.md - usage/capi/index_cn.rst - -开发标准 --------- - -.. toctree:: - :maxdepth: 1 - - dev/contribute_to_paddle_cn.md - dev/write_docs_cn.rst - -模型配置 --------- - -.. toctree:: - :maxdepth: 1 - - deep_model/rnn/index_cn.rst - -性能优化 --------- - .. toctree:: :maxdepth: 1 + cmd_parameter/index_cn.rst + cluster/index_cn.rst + capi/index_cn.rst + rnn/index_cn.rst optimization/gpu_profiling_cn.rst diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst index 6d1bf7dfc003da6de31410ee0a7959233adfaf76..ae8b86f75b5de770312fb2fdc46db490a18e5ff6 100644 --- a/doc/howto/index_en.rst +++ b/doc/howto/index_en.rst @@ -1,37 +1,10 @@ HOW TO ======= -Usage -------- - -.. toctree:: - :maxdepth: 1 - - usage/cmd_parameter/index_en.rst - usage/cluster/cluster_train_en.md - -Development ------------- - -.. toctree:: - :maxdepth: 1 - - dev/new_layer_en.rst - dev/contribute_to_paddle_en.md - dev/write_docs_en.rst - -Configuration -------------- - -.. toctree:: - :maxdepth: 1 - - deep_model/rnn/index_en.rst - -Optimization -------------- - .. 
toctree:: :maxdepth: 1 + cmd_parameter/index_en.rst + cluster/index_en.rst + rnn/index_en.rst optimization/gpu_profiling_en.rst diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling_en.md similarity index 100% rename from doc/howto/optimization/cpu_profiling.md rename to doc/howto/optimization/cpu_profiling_en.md diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst index e2b0b0396e0034b01ed2c5081effdd3bcabf31ae..0239eef4f118197bf92f9fc7d323be58344b0ded 100644 --- a/doc/howto/optimization/gpu_profiling_cn.rst +++ b/doc/howto/optimization/gpu_profiling_cn.rst @@ -1,6 +1,6 @@ -================== -GPU性能分析与调优 -================== +============ +GPU性能调优 +============ .. contents:: diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/rnn/hierarchical_layer_cn.rst similarity index 100% rename from doc/howto/deep_model/rnn/hierarchical_layer_cn.rst rename to doc/howto/rnn/hierarchical_layer_cn.rst diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/rnn/hrnn_rnn_api_compare_cn.rst similarity index 100% rename from doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst rename to doc/howto/rnn/hrnn_rnn_api_compare_cn.rst diff --git a/doc/howto/rnn/index_cn.rst b/doc/howto/rnn/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..bcc8c2f46eb662ec3650e829a77992224dbbb8e7 --- /dev/null +++ b/doc/howto/rnn/index_cn.rst @@ -0,0 +1,10 @@ +RNN模型 +=========== + +.. 
toctree:: + :maxdepth: 1 + + rnn_config_cn.rst + recurrent_group_cn.md + hierarchical_layer_cn.rst + hrnn_rnn_api_compare_cn.rst diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/rnn/index_en.rst similarity index 100% rename from doc/howto/deep_model/rnn/index_en.rst rename to doc/howto/rnn/index_en.rst diff --git a/doc/howto/deep_model/rnn/recurrent_group_cn.md b/doc/howto/rnn/recurrent_group_cn.md similarity index 100% rename from doc/howto/deep_model/rnn/recurrent_group_cn.md rename to doc/howto/rnn/recurrent_group_cn.md diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/rnn/rnn_config_cn.rst similarity index 100% rename from doc/howto/deep_model/rnn/rnn_config_cn.rst rename to doc/howto/rnn/rnn_config_cn.rst diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/rnn/rnn_config_en.rst similarity index 100% rename from doc/howto/deep_model/rnn/rnn_config_en.rst rename to doc/howto/rnn/rnn_config_en.rst diff --git a/doc/howto/deep_model/rnn/src/bi_lstm.jpg b/doc/howto/rnn/src/bi_lstm.jpg similarity index 100% rename from doc/howto/deep_model/rnn/src/bi_lstm.jpg rename to doc/howto/rnn/src/bi_lstm.jpg diff --git a/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png b/doc/howto/rnn/src/encoder-decoder-attention-model.png similarity index 100% rename from doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png rename to doc/howto/rnn/src/encoder-decoder-attention-model.png diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn.dot b/doc/howto/rnn/src/glossary_rnn.dot similarity index 100% rename from doc/howto/deep_model/rnn/src/glossary_rnn.dot rename to doc/howto/rnn/src/glossary_rnn.dot diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot b/doc/howto/rnn/src/glossary_rnn_with_memory.dot similarity index 100% rename from doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot rename to doc/howto/rnn/src/glossary_rnn_with_memory.dot diff --git 
a/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot similarity index 100% rename from doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot rename to doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot diff --git a/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot b/doc/howto/rnn/src/simple_full_recurrent.dot similarity index 100% rename from doc/howto/deep_model/rnn/src/simple_full_recurrent.dot rename to doc/howto/rnn/src/simple_full_recurrent.dot diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/usage/capi/compile_paddle_lib_cn.md deleted file mode 100644 index ac5ecffe2ea8ddc3703a32e9a0a8ee83bbe5dd14..0000000000000000000000000000000000000000 --- a/doc/howto/usage/capi/compile_paddle_lib_cn.md +++ /dev/null @@ -1,122 +0,0 @@ -## 编译 PaddlePaddle 预测库 - -### 概述 - -使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时需配制下面这些编译选项: - -必须配置选项: -- `WITH_C_API`,必须配置为`ON`。 - -推荐配置选项: -- `WITH_PYTHON`,推荐配置为`OFF` -- `WITH_SWIG_PY`,推荐配置为`OFF` -- `WITH_GOLANG`,推荐设置为`OFF` - -可选配置选项: -- `WITH_GPU`,可配置为`ON/OFF` -- `WITH_MKL`,可配置为`ON/OFF` - -对推荐配置中的选项建议按照设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 - -下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): - -```shell -PADDLE_ROOT=/path/of/capi -git clone https://github.com/PaddlePaddle/Paddle.git -cd Paddle -mkdir build -cd build -cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_C_API=ON \ - -DWITH_SWIG_PY=OFF \ - -DWITH_GOLANG=OFF \ - -DWITH_PYTHON=OFF \ - -DWITH_MKL=OFF \ - -DWITH_GPU=OFF \ - .. 
-``` - -执行上述代码生成Makefile文件后,执行:`make && make install`。成功编译后,使用C-API所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件)均会存放于`PADDLE_ROOT`目录中。 - -编译成功后在 `PADDLE_ROOT` 下会看到如下目录结构(包括了编译出的PaddlePaddle头文件和链接库,以及第三方依赖链接库和头文件(如果需要,由链接方式决定)): - -```text -├── include -│   └── paddle -│   ├── arguments.h -│   ├── capi.h -│   ├── capi_private.h -│   ├── config.h -│   ├── error.h -│   ├── gradient_machine.h -│   ├── main.h -│   ├── matrix.h -│   ├── paddle_capi.map -│   └── vector.h -├── lib -│   ├── libpaddle_capi_engine.a -│   ├── libpaddle_capi_layers.a -│   ├── libpaddle_capi_shared.so -│   └── libpaddle_capi_whole.a -└── third_party - ├── gflags - │   ├── include - │   │   └── gflags - │   │   ├── gflags_completions.h - │   │   ├── gflags_declare.h - │   │   ... - │   └── lib - │   └── libgflags.a - ├── glog - │   ├── include - │   │   └── glog - │   │   ├── config.h - │   │   ... - │   └── lib - │   └── libglog.a - ├── openblas - │   ├── include - │   │   ├── cblas.h - │   │   ... - │   └── lib - │   ... - ├── protobuf - │   ├── include - │   │   └── google - │   │   └── protobuf - │   │   ... - │   └── lib - │   └── libprotobuf-lite.a - └── zlib - ├── include - │   ... - └── lib - ... - -``` - -### 链接说明 - -目前提供三种链接方式: - -1. 链接`libpaddle_capi_shared.so` 动态库 - - 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意: - 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 - 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 - 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 - - 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**。 - -2. 链接静态库 `libpaddle_capi_whole.a` - - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: - 1. 需要指定`-Wl,--whole-archive`链接选项。 - 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 - 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 - 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 - -3. 
链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a` - - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_layers.a`和`libpaddle_capi_engine.a`时,需注意: - 1. 这种链接方式主要用于移动端预测。 - 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 - 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 - 1. 第三方依赖库需要按照与方式2同样方法显式地进行链接。 diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/usage/capi/index_cn.rst deleted file mode 100644 index fd774fbc742671c5a8009cb742f2c9d06a525199..0000000000000000000000000000000000000000 --- a/doc/howto/usage/capi/index_cn.rst +++ /dev/null @@ -1,9 +0,0 @@ -PaddlePaddle C-API -================== - -.. toctree:: - :maxdepth: 1 - - compile_paddle_lib_cn.md - organization_of_the_inputs_cn.md - workflow_of_capi_cn.md diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/usage/capi/workflow_of_capi_cn.md deleted file mode 100644 index e0a42fff12cf0f53dee18165e059150861524f74..0000000000000000000000000000000000000000 --- a/doc/howto/usage/capi/workflow_of_capi_cn.md +++ /dev/null @@ -1,119 +0,0 @@ -## C-API 使用流程 - -这篇文档介绍 PaddlePaddle C-API 整体使用流程。 - -### 使用流程 - -使用 C-API 的工作流程如图1所示,分为(1)准备预测模型和(2)预测程序开发两大部分。 - -

-
图1. C-API使用流程示意图 -

- -- 准备预测模型 - 1. 只将神经网络结构进行序列化。 - - 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。 - 1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。 - - 神经网络模型结构和训练好的模型将被序列化合并入一个文件。 - - 预测时只需加载一个文件便于发布。 - - **注意**:以上两种方式只需选择其一即可。 -- 调用 C-API 开发预测序 - 1. 初始化PaddlePaddle运行环境。 - 1. 加载预测模型。 - 1. 创建神经网络输入,组织输入数据。 - 1. 进行前向计算,获得计算结果。 - 1. 清理和结束。 - -### 准备预测模型 - -准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。 - -调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 - -下面,我们将训练结束后存储下来的模型转换成预测模型。 - -1. 
序列化神经网络模型配置 - - PaddlePaddle 使用 protobuf 来传输网络配置文件中定义的网络结构和相关参数,使用 C-API 进行预测时,需要将网络结构使用 protobuf 进行序列化,写入文件中。 - - 调用[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)中的`dump_v2_config`函数能够将使用 PaddlePaddle V2 API 定义的神经网络结构 dump 到指定文件中,示例代码如下: - - ```python - from paddle.utils.dump_v2_config import dump_v2_config - from mnist_v2 import network - - predict = network(is_infer=True) - dump_v2_config(predict, "trainer_config.bin", True) - ``` - - 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 - - 使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。 - -2. 合并模型文件(可选) - - 一些情况为了便于发布,希望能够将序列化后的神经网络结构和训练好的模型参数打包进一个文件。对于这样的需求,可以使用`paddle.utils.merge_model`中的`merge_v2_model`接口对神经网络结构和训练好的参数进行序列化,将序列化结果写入一个文件内。 - - 代码示例如下: - - ```python - from paddle.utils.merge_model import merge_v2_model - from mnist_v2 import network - - net = network(is_infer=True) - param_file = "models/params_pass_4.tar" - output_file = "output.paddle.model" - merge_v2_model(net, param_file, output_file) - ``` - 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 - -#### 注意事项 -1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。 -1. **预测使用的网络结构往往不同于训练**,通常需要去掉网络中的:(1)类别标签层;(2)损失函数层;(3)`evaluator`等,只留下核心计算层,请注意是否需要修改网络结构。 -1.
预测时,可以获取网络中定义的任意多个(大于等于一个)层前向计算的结果,需要哪些层的计算结果作为输出,就将这些层加入一个Python list中,作为调用`dump_v2_config`的第一个参数。 - -### 编写预测代码 - -预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 - -#### step 1. 初始化PaddlePaddle运行环境 -第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 - -#### step2. 加载模型 - -这里介绍C-API使用中的一个重要概念:Gradient Machine。 - -概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式: - -1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; -1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。 - -- 注意事项 - 1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。 - 1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。 - 1. 通过灵活使用以上两个接口,加载模型可其它多种方式,例如也可在程序运行过程中再加载另外一个模型。 - -#### step 3. 创建神经网络输入,组织输入数据 - -基本使用概念: -- 在PaddlePaddle内部,神经网络中一个计算层的输入输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 -- `Argument` 并不真正“存储”数据,而是将输入/输出数据有机地组织在一起。 -- 在`Argument`内部由:1. `Matrix`(二维矩阵,存储浮点类型输入/输出);2. 
`IVector`(一维数组,**仅用于存储整型值**,多用于自然语言处理任务)来实际存储数据。 - -C-API支持的所有输入数据类型和他们的组织方式,请参考“输入/输出数据组织”一节。 - -这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。 - -在组织神经网络输入,获取输出时,需要思考完成以下工作: -1. 为每一个输入/输出创建`argument`; -1. 为每一个`argument`创建`paddle_matrix`来存储数据; - -与输入不同的是,不需在使用C-API时为输出`argument`的`paddle_matrix`对象分配空间。前向计算之后PaddlePaddle内部已经分配/管理了每个计算层输出的存储空间。 - -#### step 4. 前向计算 - -完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。 - -#### step 5. 清理 - -结束预测之后,对使用的中间变量和资源进行清理和释放。 diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md deleted file mode 100644 index 0f3db59607fb6b43da01f5fdb46949087517ed6c..0000000000000000000000000000000000000000 --- a/doc/howto/usage/cluster/cluster_train_cn.md +++ /dev/null @@ -1,188 +0,0 @@ -# 分布式训练 - - -## 概述 - -本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示: - - - -- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。 -- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。 -- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。 - -这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。 - -在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。 - - -## 环境准备 - -1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。 -1. 
我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。 - -安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`): -```bash -$ paddle version -PaddlePaddle 0.10.0, compiled with - with_avx: ON - with_gpu: OFF - with_double: OFF - with_python: ON - with_rdma: OFF - with_timer: OFF -``` - -下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。 - -## 启动参数说明 -### 启动参数服务器 -执行以下的命令启动一个参数服务器并等待和计算节点的数据交互 -```bash -$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 -``` - -如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行: -```bash -$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log -``` - -参数说明 - -- port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信 -- ports_num:**必选,默认1**,监听的端口个数 -- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数 -- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 - -### 启动计算节点 -执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py) -```bash -$ python train.py -``` - -trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 - -使用环境变量: - -```bash -export PADDLE_INIT_USE_GPU=False -export PADDLE_INIT_TRAINER_COUNT=1 -export PADDLE_INIT_PORT=7164 -export PADDLE_INIT_PORTS_NUM=1 -export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 -export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 -export PADDLE_INIT_TRAINER_ID=0 -export PADDLE_INIT_PSERVERS=127.0.0.1 -``` - -使用参数: - -```python -paddle.init( - use_gpu=False, - 
trainer_count=1, - port=7164, - ports_num=1, - ports_num_for_sparse=1, - num_gradient_servers=1, - trainer_id=0, - pservers="127.0.0.1") -``` - -参数说明 - -- use_gpu: **可选,默认False**,是否启用GPU训练 -- trainer_count:**必选,默认1**,当前trainer的线程数目 -- port:**必选,默认7164**,连接到pserver的端口 -- ports_num:**必选,默认1**,连接到pserver的端口个数 -- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 -- num_gradient_servers:**必选,默认1**,当前训练任务trainer总数 -- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 -- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 - - -### 准备数据集 - -参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。 - -在线上系统中,通常会使用MapReduce任务的输出结果作为训练结果,这样训练文件的个数会比较多,而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件: - -```python -import os -train_list = [] -flist = os.listdir("/train_data/") -for f in flist: - suffix = int(f.split("-")[1]) - if suffix % TRAINER_COUNT == TRAINER_ID: - train_list.append(f) -``` - -示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000`、`-00001`和`-00002`): -``` -train.txt -train.txt-00000 -train.txt-00001 -train.txt-00002 -test.txt -test.txt-00000 -test.txt-00001 -test.txt-00002 -``` - -在进行分布式训练时,每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中,系统会提供一个分布式存储服务,这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储,则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。 - -对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。 - -### 准备训练程序 - -我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。 - -最后,工作空间应如下所示: -``` -. 
-|-- my_lib.py -|-- word_dict.pickle -|-- train.py -|-- train_data_dir/ -| |-- train.txt-00000 -| |-- train.txt-00001 -| |-- train.txt-00002 -`-- test_data_dir/ - |-- test.txt-00000 - |-- test.txt-00001 - `-- test.txt-00002 -``` - -- `my_lib.py`:会被`train.py`调用的一些用户定义的库函数,比如PIL库等。 -- `word_dict.pickle`:在`train.py`中会使用到的字典数据文件。 -- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置: - - ```python - cluster_train_file = "./train_data_dir/train/train.txt" - cluster_test_file = "./test_data_dir/test/test.txt" - node_id = os.getenv("OMPI_COMM_WORLD_RANK") - if not node_id: - raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") - ``` - -- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。 -- `test_data_dir`:包含测试数据集的目录。 - -## 使用分布式计算平台或工具 - -PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括: -- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。 -- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。 -- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。 - -对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。 - -在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。 - -## 在不同集群中运行 - - - [fabric集群](fabric_cn.md) - - [openmpi集群](openmpi_cn.md) - - [kubernetes单机](k8s_cn.md) - - [kubernetes distributed分布式](k8s_distributed_cn.md) - - [AWS上运行kubernetes集群训练](k8s_aws_cn.md) diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md deleted file mode 100644 index f9424f8f1a29fcf001c4e7976086512b22f6e858..0000000000000000000000000000000000000000 --- a/doc/howto/usage/cluster/cluster_train_en.md +++ /dev/null @@ -1,191 +0,0 @@ -# Distributed Training - -## Introduction - -In this article, we'll explain 
how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job: - - - -- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job. -- Trainer: each trainer reads the data shard, and trains the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer downloads optimized parameters and continues its training. -- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers. - -PaddlePaddle can support both synchronous stochastic gradient descent (SGD) and asynchronous SGD. - -When training with synchronous SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noise in the gradient. - -## Preparations -1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes". -2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries.
To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html). - -After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`): - -```bash -$ paddle version -PaddlePaddle 0.10.0rc, compiled with - with_avx: ON - with_gpu: OFF - with_double: OFF - with_python: ON - with_rdma: OFF - with_timer: OFF -``` - -We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API. - -## Command-line arguments - -### Starting parameter server - -Type the below command to start a parameter server which will wait for trainers to connect: - -```bash -$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 -``` - -If you wish to run parameter servers in background, and save a log file, you can type: -```bash -$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log -``` - -Parameter Description - -- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput. -- ports_num: **required, default 1**, total number of ports will listen on. -- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update. -- num_gradient_servers: **required, default 1**, total number of gradient servers. 
- -### Starting trainer -Type the command below to start the trainer (name the file whatever you want, like "train.py") - -```bash -$ python train.py -``` - -Trainers' network needs to be connected with parameter servers' network to finish the job. Trainers need to know port and IPs to locate parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass to `paddle.init()` function. Arguments passed to the `paddle.init()` function will overwrite environment variables. - -Use environment variables: - -```bash -export PADDLE_INIT_USE_GPU=False -export PADDLE_INIT_TRAINER_COUNT=1 -export PADDLE_INIT_PORT=7164 -export PADDLE_INIT_PORTS_NUM=1 -export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 -export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 -export PADDLE_INIT_TRAINER_ID=0 -export PADDLE_INIT_PSERVERS=127.0.0.1 -python train.py -``` - -Pass arguments: - -```python -paddle.init( - use_gpu=False, - trainer_count=1, - port=7164, - ports_num=1, - ports_num_for_sparse=1, - num_gradient_servers=1, - trainer_id=0, - pservers="127.0.0.1") -``` - -Parameter Description - -- use_gpu: **optional, default False**, set to "True" to enable GPU training. -- trainer_count: **required, default 1**, number of threads in current trainer. -- port: **required, default 7164**, port to connect to parameter server. -- ports_num: **required, default 1**, number of ports for communication. -- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation. -- num_gradient_servers: **required, default 1**, number of trainers in current job. -- trainer_id: **required, default 0**, ID for every trainer, start from 0. -- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
- -### Prepare Training Dataset - -Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the count of output files. - -In the real world, we often use `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training file to trainers: - -```python -import os -train_list = [] -flist = os.listdir("/train_data/") -for f in flist: - suffix = int(f.split("-")[1]) - if suffix % TRAINER_COUNT == TRAINER_ID: - train_list.append(f) -``` - -Example code `prepare.py` will split training data and testing data into 3 files with numeric suffix like `-00000`, `-00001` and `-00002`: - -``` -train.txt -train.txt-00000 -train.txt-00001 -train.txt-00002 -test.txt -test.txt-00000 -test.txt-00001 -test.txt-00002 -``` - -When the job starts, every trainer needs to get its own part of data. In some distributed systems a storage service will be provided, so the data under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node. - -Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job. - -### Prepare Training program - -We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory. - - -Your workspace may look like: -``` -. -|-- my_lib.py -|-- word_dict.pickle -|-- train.py -|-- train_data_dir/ -| |-- train.txt-00000 -| |-- train.txt-00001 -| |-- train.txt-00002 -`-- test_data_dir/ - |-- test.txt-00000 - |-- test.txt-00001 - `-- test.txt-00002 -``` - -- `my_lib.py`: user defined libraries, like PIL libs.
This is optional. -- `word_dict.pickle`: dict file for training word embedding. -- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using a different cluster platform to retrieve configuration environment variables: - - ```python - cluster_train_file = "./train_data_dir/train/train.txt" - cluster_test_file = "./test_data_dir/test/test.txt" - node_id = os.getenv("OMPI_COMM_WORLD_RANK") - if not node_id: - raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") - ``` - -- `train_data_dir`: containing training data. Mount from storage service or copy training data to here. -- `test_data_dir`: containing testing data. - -## Use cluster platforms or cluster management tools - -PaddlePaddle supports running jobs on several platforms including: -- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google. -- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework. -- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster. - -We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2). - -When a job is dispatched to different nodes, these cluster platforms provide APIs or environment variables to the training processes, such as the node ID, IP, or total number of nodes.
- -## Use different clusters - - - [fabric](fabric_en.md) - - [openmpi](openmpi_en.md) - - [kubernetes](k8s_en.md) - - [kubernetes on AWS](k8s_aws_en.md) diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/usage/cmd_parameter/index_cn.rst deleted file mode 100644 index 4c8729821110b9aec99351fc0a83a1ba75a8a2bb..0000000000000000000000000000000000000000 --- a/doc/howto/usage/cmd_parameter/index_cn.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. _cmd_line_index: - -设置命令行参数 -=============== - -.. toctree:: - :maxdepth: 1 - - use_case_cn.md - arguments_cn.md - detail_introduction_cn.md diff --git a/doc/index_cn.rst b/doc/index_cn.rst index ada51c2d73263898b2c748437f8eb0f30b537073..0f645db6fc5d0f84bbe0cbb335677752e3a355ea 100644 --- a/doc/index_cn.rst +++ b/doc/index_cn.rst @@ -5,7 +5,7 @@ PaddlePaddle 文档 :maxdepth: 1 getstarted/index_cn.rst + build_and_install/index_cn.rst howto/index_cn.rst - api/index_cn.rst + dev/index_cn.rst faq/index_cn.rst - mobile/index_cn.rst diff --git a/doc/index_en.rst b/doc/index_en.rst index 23b64b6cadf776d44c4d0aa5a550ffe24be13b18..166f56c28f464563a0b36007f58cebb58c286916 100644 --- a/doc/index_en.rst +++ b/doc/index_en.rst @@ -5,6 +5,6 @@ PaddlePaddle Documentation :maxdepth: 1 getstarted/index_en.rst + build_and_install/index_en.rst howto/index_en.rst - api/index_en.rst - mobile/index_en.rst + dev/index_en.rst diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst deleted file mode 100644 index 1d99666e58b7043b85b0203ee0dfcd1957710161..0000000000000000000000000000000000000000 --- a/doc/mobile/index_cn.rst +++ /dev/null @@ -1,9 +0,0 @@ -MOBILE -====== - -.. 
toctree:: - :maxdepth: 1 - - cross_compiling_for_android_cn.md - cross_compiling_for_ios_cn.md - cross_compiling_for_raspberry_cn.md diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst deleted file mode 100644 index ef421dacad458828cadf8cf505375d6c4bfd9dde..0000000000000000000000000000000000000000 --- a/doc/mobile/index_en.rst +++ /dev/null @@ -1,9 +0,0 @@ -MOBILE -====== - -.. toctree:: - :maxdepth: 1 - - cross_compiling_for_android_en.md - cross_compiling_for_ios_en.md - cross_compiling_for_raspberry_en.md diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 3f9c132ef6ae03c7614e10484715676c8019821e..c7deba2ab475d3c4f2c95327af77af7031b591fd 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -19,12 +19,7 @@ else() endif() if(NOT ANDROID AND NOT IOS) - add_subdirectory(memory) - add_subdirectory(platform) - add_subdirectory(framework) - add_subdirectory(operators) - add_subdirectory(pybind) - add_subdirectory(inference) + add_subdirectory(fluid) endif() if(WITH_SWIG_PY) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b4191518c45d0579f800ecb901dcd9667e17d5 --- /dev/null +++ b/paddle/fluid/CMakeLists.txt @@ -0,0 +1,6 @@ +add_subdirectory(memory) +add_subdirectory(platform) +add_subdirectory(framework) +add_subdirectory(operators) +add_subdirectory(pybind) +add_subdirectory(inference) diff --git a/paddle/framework/.clang-format b/paddle/fluid/framework/.clang-format similarity index 100% rename from paddle/framework/.clang-format rename to paddle/fluid/framework/.clang-format diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef1bc07c2dbe71268c706a119056d3a9fcfc7f8c --- /dev/null +++ b/paddle/fluid/framework/CMakeLists.txt @@ -0,0 +1,98 @@ +# ddim lib +proto_library(framework_proto SRCS framework.proto) + +cc_library(ddim SRCS 
ddim.cc DEPS eigen3 boost) +cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) +nv_test(dim_test SRCS dim_test.cu DEPS ddim) + +if (WITH_GPU) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) +else() + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto) +endif () + +cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) +if (WITH_GPU) + nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor) +else() + cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) +endif() + +cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) + +nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) +cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) +nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) + +cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) + +cc_test(variable_test SRCS variable_test.cc) + +cc_library(threadpool SRCS threadpool.cc DEPS enforce) +cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) + +cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_test(scope_test SRCS scope_test.cc DEPS scope) + +cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) +nv_test(data_device_transform_test SRCS data_device_transform_test.cu + DEPS operator op_registry init math_function) + +cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) +cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) + +cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) +cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform) + +cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor + 
framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) + +cc_library(attribute SRCS attribute.cc DEPS framework_proto boost) +cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc +device_context) +cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) +cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) +cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) +cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog + shape_inference data_transform lod_tensor) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) + +cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) +nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) + +py_proto_compile(framework_py_proto SRCS framework.proto) +# Generate an empty __init__.py to make framework_py_proto as a valid python module. +add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) +add_dependencies(framework_py_proto framework_py_proto_init) +add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto + COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/ + COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto." 
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + +cc_library(backward SRCS backward.cc DEPS net_op) +cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op) +cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) + +cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) + +cc_library(executor SRCS executor.cc DEPS op_registry device_context scope +framework_proto backward glog lod_rank_table profiler feed_fetch_method) + +cc_library(prune SRCS prune.cc DEPS framework_proto) +cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) +cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry + proto_desc) +cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) +cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) + +cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator) +cc_test(init_test SRCS init_test.cc DEPS init) + +cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) +cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) + +cc_test(channel_test SRCS channel_test.cc) diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d7e7366b0723c630b24d62c1f5d0a72cf42d770 --- /dev/null +++ b/paddle/fluid/framework/attribute.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/attribute.h" + +#include + +namespace paddle { +namespace framework { + +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { + switch (attr_desc.type()) { + case proto::AttrType::BOOLEAN: { + return attr_desc.b(); + } + case proto::AttrType::INT: { + return attr_desc.i(); + } + case proto::AttrType::FLOAT: { + return attr_desc.f(); + } + case proto::AttrType::STRING: { + return attr_desc.s(); + } + case proto::AttrType::BOOLEANS: { + std::vector val(attr_desc.bools_size()); + for (int i = 0; i < attr_desc.bools_size(); ++i) { + val[i] = attr_desc.bools(i); + } + return val; + } + case proto::AttrType::INTS: { + std::vector val(attr_desc.ints_size()); + for (int i = 0; i < attr_desc.ints_size(); ++i) { + val[i] = attr_desc.ints(i); + } + return val; + } + case proto::AttrType::FLOATS: { + std::vector val(attr_desc.floats_size()); + for (int i = 0; i < attr_desc.floats_size(); ++i) { + val[i] = attr_desc.floats(i); + } + return val; + } + case proto::AttrType::STRINGS: { + std::vector val(attr_desc.strings_size()); + for (int i = 0; i < attr_desc.strings_size(); ++i) { + val[i] = attr_desc.strings(i); + } + return val; + } + case proto::AttrType::LONG: { + return attr_desc.l(); + } + default: + PADDLE_THROW("Unsupport attr type %d", attr_desc.type()); + } + return boost::blank(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h new file mode 100644 index 0000000000000000000000000000000000000000..16be42ae71497bcc755d10eee2d73d331ede7da6 --- /dev/null +++ b/paddle/fluid/framework/attribute.h @@ -0,0 +1,284 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +template +inline proto::AttrType AttrTypeID() { + Attribute tmp = T(); + return static_cast(tmp.which() - 1); +} + +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc); + +class AttrReader { + public: + explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {} + + template + inline const T& Get(const std::string& name) const { + PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", + name); + return boost::get(attrs_.at(name)); + } + + private: + const AttributeMap& attrs_; +}; + +// check whether a value(attribute) fit a certain limit +template +class GreaterThanChecker { + public: + explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + void operator()(T& value) const { + PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); + } + + private: + T lower_bound_; +}; + +template +class EqualGreaterThanChecker { + public: + explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + void operator()(T& value) const { + PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); + } + + private: + T lower_bound_; +}; + +// we can provide users more common Checker, like 'LessThanChecker', +// 'BetweenChecker'... 
+ +template +class DefaultValueSetter { + public: + explicit DefaultValueSetter(T default_value) + : default_value_(default_value) {} + void operator()(T& value) const { value = default_value_; } + + private: + T default_value_; +}; + +template +class EnumInContainer { + public: + explicit EnumInContainer(const std::unordered_set& c) : container_(c) {} + void operator()(T& val) const { + PADDLE_ENFORCE(container_.find(val) != container_.end(), + "Value %s is not in enum container %s", val, + ContainerDebugString()); + } + + private: + std::string ContainerDebugString() const { + std::ostringstream sout; + sout << "["; + size_t cnt = 0; + for (auto& v : container_) { + sout << v; + ++cnt; + if (cnt != container_.size()) { + sout << " ,"; + } + } + sout << "]"; + return sout.str(); + } + + std::unordered_set container_; +}; + +template +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + T* operator()(Attribute& attr) const { + T* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s", + attr_name_, typeid(T).name(), attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +// special handle bool +// FIXME(yuyang18): Currently we cast bool into int in python binding. It is +// hard to change the logic there. In another way, we should correct handle +// if the user set `some_flag=1`. +// +// FIX ME anytime if there is a better solution. 
+template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + bool* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + float val = boost::get(attr); + attr = static_cast(val); + } + bool* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s", + attr_name_, attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + int64_t* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } + int64_t* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", + attr_name_, attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +// check whether a certain attribute fit its limits +// an attribute can have more than one limits +template +class TypedAttrChecker { + typedef std::function ValueChecker; + + public: + explicit TypedAttrChecker(const std::string& attr_name) + : attr_name_(attr_name) {} + + TypedAttrChecker& InEnum(const std::unordered_set& range) { + value_checkers_.push_back(EnumInContainer(range)); + return *this; + } + + TypedAttrChecker& GreaterThan(const T& lower_bound) { + value_checkers_.push_back(GreaterThanChecker(lower_bound)); + return *this; + } + + TypedAttrChecker& EqualGreaterThan(const T& lower_bound) { + 
value_checkers_.push_back(EqualGreaterThanChecker(lower_bound)); + return *this; + } + + // we can add more common limits, like LessThan(), Between()... + + TypedAttrChecker& SetDefault(const T& default_value) { + PADDLE_ENFORCE(default_value_setter_.empty(), + "%s can't have more than one default value!", attr_name_); + default_value_setter_.push_back(DefaultValueSetter(default_value)); + return *this; + } + + // allow users provide their own checker + TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) { + value_checkers_.push_back(checker); + return *this; + } + + void operator()(AttributeMap& attr_map) const { + if (!attr_map.count(attr_name_)) { + // user do not set this attr + PADDLE_ENFORCE(!default_value_setter_.empty(), + "Attribute '%s' is required!", attr_name_); + // default_value_setter_ has no more than one element + T val; + (default_value_setter_[0])(val); + attr_map[attr_name_] = val; + } + Attribute& attr = attr_map.at(attr_name_); + ExtractAttribute extract_attr(attr_name_); + T* attr_value = extract_attr(attr); + for (const auto& checker : value_checkers_) { + checker(*attr_value); + } + } + + private: + std::string attr_name_; + std::vector value_checkers_; + std::vector default_value_setter_; +}; + +// check whether op's all attributes fit their own limits +class OpAttrChecker { + typedef std::function AttrChecker; + + public: + template + TypedAttrChecker& AddAttrChecker(const std::string& attr_name) { + attr_checkers_.push_back(TypedAttrChecker(attr_name)); + AttrChecker& checker = attr_checkers_.back(); + return *(checker.target>()); + } + + void Check(AttributeMap& attr_map) const { + for (const auto& checker : attr_checkers_) { + checker(attr_map); + } + } + + private: + std::vector attr_checkers_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward.cc b/paddle/fluid/framework/backward.cc new file mode 100644 index 
0000000000000000000000000000000000000000..c4795f4fc5c73034b23305162ea3b710480d8ebc --- /dev/null +++ b/paddle/fluid/framework/backward.cc @@ -0,0 +1,585 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/backward.h" +#include "paddle/fluid/operators/net_op.h" + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace framework { + +static std::unordered_set* g_ctrl_flow_ops_ = nullptr; +// Control Flow operators's backward is significantly different from +// computational operators. Hack Code here. +// We should design a better way to backward CtrlFlowOps. 
+static std::unordered_set& CtrlFlowOps() { + if (g_ctrl_flow_ops_ == nullptr) { + g_ctrl_flow_ops_ = new std::unordered_set{ + "increment", "lod_rank_table", "less_than"}; + } + return *g_ctrl_flow_ops_; +} + +static inline std::unique_ptr CreateGradOp( + const OperatorBase& op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var) { + OpDesc op_desc; + op_desc.SetInputMap(op.Inputs()); + op_desc.SetOutputMap(op.Outputs()); + op_desc.SetType(op.Type()); + op_desc.SetAttrMap(op.Attrs()); + auto& info = OpInfoMap::Instance().Get(op.Type()); + auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {}); + std::vector> grad_ops; + grad_ops.reserve(grad_descs.size()); + std::transform(grad_descs.begin(), grad_descs.end(), + std::back_inserter(grad_ops), + [](const std::unique_ptr& grad_desc) { + return OpRegistry::CreateOp(*grad_desc); + }); + PADDLE_ENFORCE(!grad_ops.empty()); + if (grad_ops.size() == 1) { + return std::move(grad_ops[0]); + } else { + auto net_op = new operators::NetOp(); + for (auto& grad_op : grad_ops) { + net_op->AppendOp(std::move(grad_op)); + } + net_op->CompleteAddOp(); + return std::unique_ptr(net_op); + } +} + +template +static void ForEachVarName(const Map& names, T callback) { + for (auto& name : names) { + for (auto& n : name.second) { + if (callback(n)) return; + } + } +} + +// return whether all the names + suffixes in the set +static bool AllInSet( + const std::map>& names, + const std::string& suffix, const std::unordered_set& set) { + bool all_in_set = true; + ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) { + all_in_set = set.find(n + suffix) != set.end(); + return !all_in_set; + }); + return all_in_set; +} + +static std::unique_ptr NOP() { + auto net_op = new operators::NetOp(); + net_op->SetType("@NOP@"); + net_op->CompleteAddOp(); + return std::unique_ptr(net_op); +} + +// Get backward operator from a forward operator, a recursive implementation. 
+// +// no_grad_names the gradient variable names without gradient calculating. +// +// uniq_id is a unique index used inside recursively calling +// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and +// pass `uniq_id` through recursive calling. +// +// returns The backward operator. In a simple situation, it may be a simple +// operator, in a complex situation, it maybe a NetOp. +// +// See Backward.h for details +static std::unique_ptr BackwardRecursive( + const OperatorBase& forwardOp, + std::unordered_set& no_grad_names, + std::unordered_map* grad_to_var, + size_t& uniq_id) { + // If all input gradients of forwarding operator do not need to calculate, + // just return an NOP. Not return null ptr because NOP does not take + // too much time for calculation, but it is useful for simplifying logic. + if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/, + no_grad_names /*set*/)) { + return NOP(); + } + + // All output gradients of forwarding operator do not need to calculate. + // Then all input gradients cannot be computed at all, and we put them into + // `no_grad_names` set. Return an NOP. + if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/, + no_grad_names /*set*/)) { + ForEachVarName(forwardOp.Inputs(), + [&no_grad_names](const std::string& name) -> bool { + no_grad_names.insert(GradVarName(name)); + return false; + }); + return NOP(); + } + + // Returned gradient network + auto net = std::unique_ptr(new operators::NetOp()); + + if (forwardOp.IsNetOp()) { + // Because forwardOp is a net op, it can static_cast. + auto& forwardNet = static_cast(forwardOp); + + // Map from output gradient variable name to operator's indices in + // backward net's ops_. That operator generates that variable. + std::unordered_map> dup_output_ops; + + size_t local_op_id = 0; + // reversely travel forwardNet and collect all duplicate outputs. 
+ for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend(); + ++it, ++local_op_id) { + auto& fwd = *it; + auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id); + ForEachVarName(bwd->Outputs(), + [&dup_output_ops, local_op_id](const std::string& out) { + dup_output_ops[out].emplace_back(local_op_id); + return false; + }); + net->AppendOp(std::move(bwd)); + } + // Get unique ID for this method. + auto uid = uniq_id++; + // TODO(dzh): more comment + // multiple operators which have the same output (y for example) may + // overwrite the same y variable when backward, special operations are token + // to handle this case. For each duplicate output, rename it to an alias + // (original name with a offset), append an `add` op for its operator, + // and finally sum all the alias variable to the final output variable y. + using Pos = std::pair>; + std::list insert_position; + for (auto& dup_output_op : dup_output_ops) { + const std::string& name = dup_output_op.first; + // duplicate @Empty@ don't need to be added + if (name == kEmptyVarName) continue; + + auto& dup_op = dup_output_op.second; + // no duplicate output + if (dup_op.size() == 1) continue; + + // process the duplicate outputs + std::vector dup_outputs; + for (size_t i = 0; i < dup_op.size(); ++i) { + // rename each duplicate output to an alias + auto op_offset = dup_op[i]; + dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" + + std::to_string(i)); + net->ops_[op_offset]->Rename(name, dup_outputs.back()); + } + // collect all the offset for each alias, + // insert a sum operator to add all aliases to output + insert_position.push_back( + {dup_op.back(), + OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}}, + AttributeMap{})}); + } + + // make sure the inserted `sum` ops follow the BFS order. 
+ insert_position.sort( + [](const Pos& l, const Pos& r) { return l.first > r.first; }); + + for (auto& pos : insert_position) { + net->InsertOp(pos.first + 1, std::move(pos.second)); + } + } else { + std::unique_ptr grad_op( + CreateGradOp(forwardOp, no_grad_names, grad_to_var)); + + ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op]( + const std::string& grad_input) { + if (no_grad_names.count(grad_input)) { + // +1 for \0 + std::string prefix = grad_input.substr( + 0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); + grad_op->Rename(grad_input, prefix + kZeroVarSuffix); + + // If part of input gradient of that operator is not calculated, fill + // zero variables to that input gradient. + net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {grad_input}}}, + AttributeMap{})); + } + return false; + }); + + ForEachVarName(grad_op->Outputs(), + [&no_grad_names, &grad_op](const std::string& grad_output) { + if (no_grad_names.count(grad_output)) { + grad_op->Rename(grad_output, kEmptyVarName); + } + return false; + }); + + if (net->ops_.empty()) { // Current no aux op is added to network + return grad_op; + } + net->AppendOp(std::move(grad_op)); + } + net->SetType("@GENERATED_BACKWARD@"); + net->CompleteAddOp(); + return std::unique_ptr( + static_cast(net.release())); +} + +// See header for comments +std::unique_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars) { + std::unordered_set no_grad_names; + no_grad_names.reserve(no_grad_vars.size() + 1); + + no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); + + for (auto& name : no_grad_vars) { + no_grad_names.insert(name + kGradVarSuffix); + } + size_t uid = 0; + std::unordered_map grad_to_var; + return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid); +} + +// ==================================== // + +static bool AllGradInSet(const std::vector& names, + const std::unordered_set& set) { + 
for (const std::string& name : names) { + if (!set.count(GradVarName(name))) { + return false; + } + } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "All input {"; + for (auto& name : names) { + sout << name << ","; + } + sout << "} is in {"; + for (auto& name : set) { + sout << name << ","; + } + sout << "}"; + VLOG(10) << sout.str(); + } + return true; +} + +static std::string FwdName(const std::string& grad_name) { + auto pos = grad_name.find("@GRAD"); + if (pos == std::string::npos) { + return ""; + } else { + return grad_name.substr(0, pos); + } +} + +static void CreateGradVarInBlock( + size_t grad_op_start_index, + const std::unordered_map& param_name_map, + BlockDesc* block_desc, + std::unordered_map* grad_var_record) { + auto ops = block_desc->AllOps(); + for (size_t op_index = grad_op_start_index; op_index < ops.size(); + ++op_index) { + std::unordered_set new_vars; + auto& ctrl_flow_ops = CtrlFlowOps(); + ForEachVarName(ops[op_index]->Outputs(), + [&](const std::string& grad_var_name) { + if (ctrl_flow_ops.find(ops[op_index]->Type()) != + ctrl_flow_ops.end()) { + if (block_desc->HasVarRecursive(grad_var_name)) { + return false; + } + } else { + if (block_desc->HasVar(grad_var_name)) { + return false; + } + } + if (grad_var_name == framework::kEmptyVarName) { + return false; + } + auto var = block_desc->Var(grad_var_name); + VLOG(10) << "Creating Variable " << grad_var_name; + new_vars.insert(var->Name()); + auto it = param_name_map.find(grad_var_name); + if (it == param_name_map.end()) { + return false; + } + auto param_var_name = it->second; + auto& grad_record = (*grad_var_record)[param_var_name]; + grad_record.name_ = grad_var_name; + grad_record.block_idx_ = block_desc->ID(); + grad_record.op_idx_ = static_cast(op_index); + return false; /* not break */ + }); + ops[op_index]->InferVarType(block_desc); + for (auto& arg : ops[op_index]->OutputArgumentNames()) { + if (new_vars.find(arg) == new_vars.end()) { + continue; + } + auto pname = 
FwdName(arg); + auto* param = block_desc->FindVarRecursive(pname); + auto* grad = block_desc->FindVar(arg); + if (param == nullptr) { + grad->SetDataType(proto::DataType::FP32); + } else { + grad->SetDataType(param->GetDataType()); + } + } + ops[op_index]->InferShape(*block_desc); + } +} + +std::vector> MakeOpGrad( + const OpDesc* op_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + const std::vector& grad_block = std::vector()) { + std::vector> grad_op_descs; + // All input gradients of forwarding operator do not need to calculate. + const std::vector& inputs = op_desc->InputArgumentNames(); + if (AllGradInSet(inputs, *no_grad_vars)) { + VLOG(10) << "Drop operator " << op_desc->Type(); + return grad_op_descs; // empty vector + } + + // All output gradients of forwarding operator do not need to calculate. + const std::vector& outputs = op_desc->OutputArgumentNames(); + + if (AllGradInSet(outputs, *no_grad_vars)) { + VLOG(10) << "Drop operator " << op_desc->Type(); + // FIXME: Hack code here + auto& ctrl_flow_ops = CtrlFlowOps(); + if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) { + // Only computational op need drop input's gradient. 
+ for (const std::string& name : inputs) { + no_grad_vars->insert(GradVarName(name)); + VLOG(10) << " Also drop " << GradVarName(name); + } + } + + return grad_op_descs; // empty vector + } + + grad_op_descs = + OpInfoMap::Instance() + .Get(op_desc->Type()) + .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block); + + std::list> pending_fill_zeros_ops; + for (auto& desc : grad_op_descs) { + for (const std::string& in_name : desc->InputArgumentNames()) { + if (no_grad_vars->count(in_name)) { + std::string prefix = in_name.substr( + 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); + std::string new_name = prefix + kZeroVarSuffix; + desc->Rename(in_name, new_name); + std::unique_ptr fill_zeros_op( + new OpDesc("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {new_name}}}, AttributeMap{})); + pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); + } + } + } + + for (auto& p : pending_fill_zeros_ops) { + grad_op_descs.insert(grad_op_descs.begin(), std::move(p)); + } + return grad_op_descs; +} + +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + int step_block_idx); + +std::vector> MakeBlockBackward( + ProgramDesc& program_desc, int block_idx, + std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var) { + VLOG(5) << "MakeBlockBackward"; + BlockDesc* cur_block = program_desc.MutableBlock(block_idx); + std::vector op_descs = cur_block->AllOps(); + std::unordered_map> dup_out_ops; + size_t grad_desc_idx = 0; + std::vector> backward_descs; + + for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { + VLOG(5) << "Making backward " << (*it)->Type() << " op"; + std::vector> op_grads; + + if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" || + (*it)->Type() == "parallel_do") { + int step_block_idx = (*it)->GetBlockAttr("sub_block"); + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, + grad_to_var, 
step_block_idx); + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); + } else if ((*it)->Type() == "conditional_block") { + BlockDesc* backward_block = + CreateStepBlock(program_desc, no_grad_vars, grad_to_var, + (*it)->GetBlockAttr("sub_block")); + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); + } else { + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); + } + + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "Made "; + for (auto& op_grad : op_grads) { + sout << op_grad->Type() << " "; + } + VLOG(10) << sout.str(); + } + + for (const auto& desc : op_grads) { + for (const std::string& out_name : desc->OutputArgumentNames()) { + if (out_name.find("@GRAD") == std::string::npos) { + // Not all outputs of a backward operator is a gradient. Only gradient + // need to be sum. Skip variables are not gradient. + continue; + } + dup_out_ops[out_name].emplace_back(grad_desc_idx); + } + ++grad_desc_idx; + } + std::transform(op_grads.begin(), op_grads.end(), + std::back_inserter(backward_descs), + [](std::unique_ptr& ptr) { return std::move(ptr); }); + } + + VLOG(5) << "Appending Sums"; + // Check whether some variables are written more than once + std::list>> pending_sum_ops; + for (const auto& dup : dup_out_ops) { + const std::string& out_name = dup.first; + const std::vector dup_op = dup.second; + if (out_name != kEmptyVarName && dup_op.size() > 1) { + std::vector sum_op_inputs; + std::string next_g_name = out_name; + for (size_t i = 0; i < dup_op.size(); ++i) { + VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name + << " duplicated"; + std::string new_name = out_name + "@RENAME@" + std::to_string(i); + backward_descs[dup_op[i]]->RenameOutput(out_name, new_name); + backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name); + sum_op_inputs.emplace_back(new_name); + next_g_name = sum_op_inputs.back(); + } + std::unique_ptr sum_op(new OpDesc("sum", {{"X", sum_op_inputs}}, + {{"Out", 
{out_name}}}, + AttributeMap{})); + pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); + } + } + + pending_sum_ops.sort([](const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; + }); + for (auto& p : pending_sum_ops) { + backward_descs.insert(backward_descs.begin() + p.first + 1, + std::move(p.second)); + } + + VLOG(5) << "MakeBlockBackward Finished"; + + return backward_descs; +} + +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + int step_block_idx) { + auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx, + no_grad_vars, grad_to_var); + BlockDesc* backward_block = + program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); + for (auto& ptr : backward_block_op_descs) { + backward_block->AppendAllocatedOp(move(ptr)); + } + return backward_block; +} + +ParamGradInfoMap AppendBackward( + ProgramDesc& program_desc, const VarDesc& target, + const std::unordered_set& no_grad_vars) { + std::unordered_set no_grad_var_names; + no_grad_var_names.reserve(no_grad_vars.size() + 1); + no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); + for (auto& name : no_grad_vars) { + no_grad_var_names.insert(GradVarName(name)); + } + + const int root_block_idx = 0; + auto root_block = program_desc.MutableBlock(root_block_idx); + + std::string fill_one_op_out = GradVarName(target.Name()); + bool is_scalar = target.GetShape() == std::vector{1}; + PADDLE_ENFORCE(is_scalar, "target should be scalar"); + VLOG(3) << "backward from loss=" << target.Name() + << " data_type=" << target.GetDataType(); + std::unique_ptr fill_one_op( + new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}}, + {{"shape", std::vector{1}}, + {"value", static_cast(1.0)}, + {"dtype", target.GetDataType()}})); + // infer var type of fill_one_op + fill_one_op->InferVarType(root_block); + + root_block->AppendAllocatedOp(std::move(fill_one_op)); + 
size_t forward_op_num = root_block->OpSize(); + size_t forward_block_num = program_desc.Size(); + + // Insert backward operators + std::unordered_map grad_to_var; + auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx, + &no_grad_var_names, &grad_to_var); + + for (auto& ptr : backward_op_descs) { + root_block->AppendAllocatedOp(std::move(ptr)); + } + // Create Variable + + // Create target gradient variable + std::unordered_map retv; + + auto var = root_block->Var(fill_one_op_out); + var->SetDataType(target.GetDataType()); + var->SetShape(target.GetShape()); + auto& target_grad = retv[target.Name()]; + target_grad.name_ = fill_one_op_out; + target_grad.block_idx_ = root_block_idx; + target_grad.op_idx_ = static_cast(forward_op_num); + + // create grad_var for all blocks in this program + CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv); + for (size_t block_index = forward_block_num; + block_index < program_desc.Size(); ++block_index) { + CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index), + &retv); + } + return retv; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward.h b/paddle/fluid/framework/backward.h new file mode 100644 index 0000000000000000000000000000000000000000..2ea6922426e1dad0ca9b6e1287701bca0adef5c8 --- /dev/null +++ b/paddle/fluid/framework/backward.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +// Create the backward operator from a forward operator. +// TODO(yuyang18): Add more API reference comment. +extern std::unique_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars); + +struct GradVarInfo { + GradVarInfo() {} + GradVarInfo(const std::string& name, int block_idx, int op_idx) + : name_(name), block_idx_(block_idx), op_idx_(op_idx) {} + + bool operator==(const GradVarInfo& b) const { + return name_ == b.name_ && block_idx_ == b.block_idx_ && + op_idx_ == b.op_idx_; + } + + std::string name_; + int block_idx_; + int op_idx_; +}; + +using ParamGradInfoMap = std::unordered_map; + +ParamGradInfoMap AppendBackward( + ProgramDesc& program_desc, const VarDesc& target, + const std::unordered_set& no_grad_vars); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward_test.cc b/paddle/fluid/framework/backward_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9604c68913f98abc4d52c84bc8fa2c02e1a6a31 --- /dev/null +++ b/paddle/fluid/framework/backward_test.cc @@ -0,0 +1,918 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/backward.h" + +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/operators/net_op.h" + +USE_NO_KERNEL_OP(fill_constant); + +namespace paddle { +namespace framework { + +using DeviceContext = platform::DeviceContext; + +class NoneOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override {} +}; + +template +class NoneKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override {} +}; + +class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { + public: + RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input X of Add"); + AddInput("b", "Bias of Add"); + AddOutput("Out", "Out of Add"); + AddComment("Add Op"); + } +}; + +class RowWiseAddGradMaker : public SingleGradOpDescMaker { + public: + using SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad_op = new OpDesc(); + grad_op->SetInput(GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(GradVarName("X"), InputGrad("X")); + grad_op->SetOutput(GradVarName("b"), InputGrad("b")); + grad_op->SetType("rowwise_add_grad"); + return std::unique_ptr(grad_op); + } +}; + +class MulOpMaker : public OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "A"); + AddInput("Y", "B"); + AddOutput("Out", "Out"); + AddAttr("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1); + 
AddAttr("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1); + AddComment("Mul"); + } +}; + +class SigmoidOpMaker : public OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X"); + AddOutput("Out", "Y"); + AddComment("Sigmoid"); + } +}; + +class NoGradOpMaker : public OpProtoAndCheckerMaker { + public: + NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X input"); + AddOutput("Out", "Y output"); + AddComment("NoGradOp, same input output. no Grad"); + } +}; + +class FcOp : public operators::NetOp { + public: + FcOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + AppendOp(OpRegistry::CreateOp( + "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, + {{"Out", {Output("mul_result")}}}, AttributeMap{})); + auto input_b = Inputs("b"); + std::string before_act = "mul_result"; + if (input_b.size() != 0) { + AppendOp(OpRegistry::CreateOp( + "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, + {{"Out", {Output("add_result")}}}, AttributeMap{})); + before_act = "add_result"; + } else { + auto out_varname = Output("add_result"); + if (out_varname != kEmptyVarName) { + this->Rename(out_varname, kEmptyVarName); + } + } + + AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, + {{"Out", {Output("Out")}}}, AttributeMap{})); + CompleteAddOp(false); + } +}; + +class FcOpMaker : public OpProtoAndCheckerMaker { + public: + FcOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddInput("W", "w"); + AddInput("b", "b"); + AddOutput("mul_result", "").AsIntermediate(); + AddOutput("add_result", "").AsIntermediate(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class ManyOutputOpMaker : 
public OpProtoAndCheckerMaker { + public: + ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "x"); + AddOutput("y", "y"); + AddOutput("z", "z"); + AddComment(""); + } +}; + +class FillZeroOpMaker : public OpProtoAndCheckerMaker { + public: + FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddOutput("Out", "out"); + AddComment(""); + } +}; + +class SumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "the input tensors of sum operator.").AsDuplicable(); + AddOutput("Out", "the output tensor of sum operator."); + AddComment(""); + } +}; + +class MultInOutOpMaker : public OpProtoAndCheckerMaker { + public: + MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddInput("H", "h"); + AddOutput("Y", "y"); + AddOutput("Z", "z"); + AddComment(""); + } +}; + +class MinusGradOpDescMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + std::vector> retv; + auto x_g = InputGrad("X"); + if (!x_g.empty()) { + auto *op_desc = new OpDesc(); + op_desc->SetType("scale"); + op_desc->SetInput("X", OutputGrad("Out")); + op_desc->SetOutput("Out", x_g); + op_desc->SetAttr("scale", 1.0f); + retv.emplace_back(op_desc); + } + + auto y_g = InputGrad("Y"); + if (!y_g.empty()) { + auto *op_desc = new OpDesc(); + op_desc->SetType("scale"); + op_desc->SetInput("X", OutputGrad("Out")); + op_desc->SetOutput("Out", y_g); + op_desc->SetAttr("scale", -1.0f); + retv.emplace_back(op_desc); + } + return retv; + } +}; + +class MinusOpMaker : public OpProtoAndCheckerMaker { + public: + MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : 
OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddInput("Y", ""); + AddOutput("Out", ""); + AddComment("minus for unittest"); + } +}; +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; +namespace ops = paddle::operators; +using EnforceNotMet = paddle::platform::EnforceNotMet; +// rowwise_add +REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker, + f::RowWiseAddGradMaker); +REGISTER_OP_CPU_KERNEL(rowwise_add, + f::NoneKernel); +REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(rowwise_add_grad, + f::NoneKernel); +// mul +REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel); +REGISTER_OP_CPU_KERNEL(mul_grad, + f::NoneKernel); +// sigmoid +REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(sigmoid, + f::NoneKernel); +REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker); +// fill_zeros_like +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker); +REGISTER_OP_CPU_KERNEL(fill_zeros_like, + f::NoneKernel); +// sum +REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel); +REGISTER_OP_CPU_KERNEL(sum_grad, + f::NoneKernel); +// fc +REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker); +// many_output_op +REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker, + many_output_op_grad, f::NoneOp); +// mult_in_out +REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad, + f::NoneOp); +REGISTER_OP_CPU_KERNEL(mult_in_out, + f::NoneKernel); +REGISTER_OP_CPU_KERNEL(mult_in_out_grad, + f::NoneKernel); +// minus +REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker); +REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel); +// scale +REGISTER_OPERATOR(scale, f::NoneOp); +REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); + +TEST(Backward, simple_op_not_need_grad) { + auto 
fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + auto gop = f::Backward(*fwd, {"x"}); + ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName); + + auto no_input_gop = f::Backward(*fwd, {"x", "b"}); + ASSERT_NE(no_input_gop, nullptr); + ASSERT_TRUE(no_input_gop->IsNetOp()); + ASSERT_EQ(0UL, static_cast(no_input_gop.get())->ops_.size()); +} + +TEST(Backward, net_fc_backward_normal) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}}, + {{"mul_result", {"mul_res"}}, + {"add_result", {"add_re"}}, + {"Out", {"out"}}}, + f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(3UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = *net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); + + f::OperatorBase &d_add = *net->ops_[1]; + ASSERT_EQ("rowwise_add_grad", d_add.Type()); + + f::OperatorBase &d_mul = *net->ops_[2]; + ASSERT_EQ("mul_grad", d_mul.Type()); +} + +TEST(Backward, net_fc_backward_not_have_b) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}}, + {{"mul_result", {"mul_res"}}, + {"add_result", {"add_res"}}, + {"Out", {"tmp"}}}, + f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(2UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = *net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); + + f::OperatorBase &d_mul = *net->ops_[1]; + ASSERT_EQ("mul_grad", d_mul.Type()); +} + +TEST(Backward, net_input_of_network_not_need_grad) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp( + 
"fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}}, + {{"mul_result", {"mul_tmp_0"}}, + {"add_result", {"add_tmp_0"}}, + {"Out", {"hidden0"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, + {{"mul_result", {"mul_tmp_1"}}, + {"add_result", {"add_tmp_1"}}, + {"Out", {"hidden1"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + auto bwd = Backward(net, {"x"}); // x@GRAD is not need. + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + + auto output_vars = bwd_net->OutputVars(true); + std::unordered_set all_outputs = + std::unordered_set(output_vars.begin(), output_vars.end()); + all_outputs.erase(f::kEmptyVarName); + + for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { + ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end()); + } + + // Not Generated X + ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end()); + + ASSERT_EQ(2UL, bwd_net->ops_.size()); + ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); + auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); + ASSERT_EQ(3UL, first_fc_grad->ops_.size()); + ASSERT_EQ(f::kEmptyVarName, + first_fc_grad->ops_[2]->Output(f::GradVarName("X"))); +} + +TEST(Backward, net_shared_weight) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, + {{"Out", {"out"}}}, f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, + {{"Out", {"FinalOut"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + + auto bwd = f::Backward(net, std::unordered_set{}); + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + ASSERT_EQ(3UL, bwd_net->ops_.size()); + ASSERT_EQ("sum", bwd_net->ops_[2]->Type()); +} + +TEST(Backward, op_all_input_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"x", "b"}); + 
ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_all_output_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"out"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_part_of_output_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, + {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"Z"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_EQ(net->ops_.size(), 2UL); + + auto &fill_zero = *net->ops_[0]; + ASSERT_EQ("fill_zeros_like", fill_zero.Type()); + ASSERT_EQ(1UL, fill_zero.Inputs("X").size()); + ASSERT_EQ("Z", fill_zero.Input("X")); + ASSERT_EQ(1UL, fill_zero.Outputs("Out").size()); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out")); + + auto &d_many_out = *net->ops_[1]; + ASSERT_EQ("many_output_op_grad", d_many_out.Type()); + ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size()); // I/O/OG + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, + d_many_out.Input(f::GradVarName("z"))); + ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y"))); + ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x"))); +} + +TEST(Backward, op_part_of_input_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"a"}); + auto &grad_mul = *backward; + ASSERT_EQ(grad_mul.Type(), "mul_grad"); + ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL); + ASSERT_EQ(grad_mul.Outputs().size(), 2UL); + ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName); + ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), 
f::GradVarName("b")); + ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out")); + ASSERT_EQ(grad_mul.Input("X"), "a"); + ASSERT_EQ(grad_mul.Input("Y"), "b"); + ASSERT_EQ(grad_mul.Input("Out"), "out"); +} + +TEST(Backward, linear_net_intermediate_variable_has_no_grad) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"mul_result", {"mul_out1"}}, + {"add_result", {"add_out1"}}, + {"Out", {"out1"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, + {{"mul_result", {"mul_out2"}}, + {"add_result", {"tmp_out2"}}, + {"Out", {"out2"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, + {{"mul_result", {"mul_out3"}}, + {"add_result", {"tmp_out3"}}, + {"Out", {"out3"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + + auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); + ASSERT_TRUE(backward->IsNetOp()); + auto bwd_net = static_cast(backward.get()); + ASSERT_EQ(bwd_net->ops_.size(), 3UL); + auto &grad_fc = *bwd_net->ops_[0]; + + const char *all = paddle::operators::NetOp::kAll; + EXPECT_EQ(grad_fc.Inputs(all).size(), + 2UL /* external input number */ + + 1UL /* external output number*/ + + 1UL /* number of gradient of external output*/ + + 2UL /* internal variable number*/ + ); + EXPECT_EQ(grad_fc.Outputs(all).size(), + 2UL /* input number of mul*/ + + 2UL /* input number of rowwise_add*/ + + 1UL /* input number of sigmod */ + - 1UL /* out2 is not needed*/); + EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL); +} + +TEST(Backward, simple_single_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + f::OpDesc *op = 
block->AppendOp(); + op->SetType("rowwise_add"); + op->SetInput("X", {"x"}); + op->SetInput("b", {"b"}); + op->SetOutput("Out", {"out"}); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 3UL); + f::OpDesc *fill_op = block->AllOps()[1]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op = block->AllOps()[2]; + EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op->InputNames().size(), 1UL); + ASSERT_EQ(grad_op->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out")})); + EXPECT_EQ(grad_op->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x")})); + EXPECT_EQ(grad_op->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b")})); + + EXPECT_EQ(var_to_grad.size(), 3UL); + EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2)); + EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("b"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x"))); +} + +TEST(Backward, default_attribute) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op = block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {"x"}); + op->SetInput("Y", {"y"}); + op->SetOutput("Out", {"out"}); + op->CheckAttrs(); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 3UL); + EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); + EXPECT_EQ(boost::get(op->GetAttr("y_num_col_dims")), 1); + + f::OpDesc *fill_op = block->AllOps()[1]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op = block->AllOps()[2]; + ASSERT_EQ(grad_op->Type(), "mul_grad"); + EXPECT_EQ(boost::get(grad_op->GetAttr("x_num_col_dims")), 1); + 
EXPECT_EQ(boost::get(grad_op->GetAttr("y_num_col_dims")), 1); +} + +TEST(Backward, simple_mult_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"out1"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out2"}); + op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + auto target = f::VarDesc("out3"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 6UL + 1); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op1 = block->AllOps()[6]; + EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + f::OpDesc *grad_op2 = block->AllOps()[5]; + EXPECT_EQ(grad_op2->Type(), "mul_grad"); + ASSERT_EQ(grad_op2->InputNames().size(), 4UL); + ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op2->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op2->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op2->Input("Out"), std::vector({"out2"})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), + 
std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + + f::OpDesc *grad_op3 = block->AllOps()[4]; + EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op3->InputNames().size(), 1UL); + ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out3")})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b3")})); + + EXPECT_EQ(var_to_grad.size(), 7UL); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 5)); + EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); + EXPECT_EQ(var_to_grad.at("out2"), + f::GradVarInfo(f::GradVarName("out2"), 0, 4)); + EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("y2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b3"))); +} + +TEST(Backward, intermedia_var_no_grad) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"x2"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out2"}); + 
op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + f::OpDesc *op4 = block->AppendOp(); + op4->SetType("mul"); + op4->SetInput("X", {"out1"}); + op4->SetInput("Y", {"out3"}); + op4->SetOutput("Out", {"out4"}); + + auto target = f::VarDesc("out4"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"out3"}); + + ASSERT_EQ(block->AllOps().size(), 7UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op1 = block->AllOps()[6]; + EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + f::OpDesc *grad_op4 = block->AllOps()[5]; + EXPECT_EQ(grad_op4->Type(), "mul_grad"); + ASSERT_EQ(grad_op4->InputNames().size(), 4UL); + ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op4->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op4->Input("Y"), std::vector({"out3"})); + EXPECT_EQ(grad_op4->Input("Out"), std::vector({"out4"})); + EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out4")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector()); + + EXPECT_EQ(var_to_grad.size(), 4UL); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 5)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + 
EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); +} + +TEST(Backward, var_no_grad) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("mult_in_out"); + op1->SetInput("X", {"x1"}); + op1->SetInput("H", {"h1"}); + op1->SetOutput("Y", {"y1"}); + op1->SetOutput("Z", {"z1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mult_in_out"); + op2->SetInput("X", {"y1"}); + op2->SetInput("H", {"z1"}); + op2->SetOutput("Y", {"y2"}); + op2->SetOutput("Z", {"z2"}); + + auto target = f::VarDesc("z2"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"z1"}); + + ASSERT_EQ(block->AllOps().size(), 6UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op2 = block->AllOps()[3]; + ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); + ASSERT_EQ(grad_op2->InputNames().size(), 6UL); + ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op2->Input("X"), std::vector({"y1"})); + EXPECT_EQ(grad_op2->Input("H"), std::vector({"z1"})); + EXPECT_EQ(grad_op2->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op2->Input("Z"), std::vector({"z2"})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")), + std::vector({f::GradVarName("z2")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), + std::vector({f::GradVarName("y1")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector()); + + f::OpDesc *fill_zero_op = block->AllOps()[4]; + ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); + ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); + ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); + EXPECT_EQ(fill_zero_op->Input("X"), std::vector({"z1"})); + EXPECT_EQ(fill_zero_op->Output("Out"), + 
std::vector({std::string("z1") + f::kZeroVarSuffix})); + + f::OpDesc *grad_op1 = block->AllOps()[5]; + ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 6UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input("X"), std::vector({"x1"})); + EXPECT_EQ(grad_op1->Input("H"), std::vector({"h1"})); + EXPECT_EQ(grad_op1->Input("Y"), std::vector({"y1"})); + EXPECT_EQ(grad_op1->Input("Z"), std::vector({"z1"})); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")), + std::vector({f::GradVarName("y1")})); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")), + std::vector({std::string("z1") + f::kZeroVarSuffix})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("H")), + std::vector({f::GradVarName("h1")})); + + EXPECT_EQ(var_to_grad.size(), 4UL); + EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3)); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5)); + EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("y1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("h1"))); +} + +TEST(Backward, shared_var) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"out1"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out1"}); + op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + auto target = f::VarDesc("out3"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); 
+ auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 8UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op3 = block->AllOps()[4]; + ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op3->InputNames().size(), 1UL); + ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out3")})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1") + "@RENAME@0"})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b3")})); + + f::OpDesc *grad_op4 = block->AllOps()[5]; + ASSERT_EQ(grad_op4->Type(), "mul_grad"); + ASSERT_EQ(grad_op4->InputNames().size(), 4UL); + ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op4->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op4->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op4->Input("Out"), std::vector({"out2"})); + EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1") + "@RENAME@1"})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + + f::OpDesc *sum_op = block->AllOps()[6]; + ASSERT_EQ(sum_op->Type(), "sum"); + ASSERT_EQ(sum_op->InputNames().size(), 1UL); + ASSERT_EQ(sum_op->OutputNames().size(), 1UL); + EXPECT_EQ(sum_op->Input("X"), + std::vector({f::GradVarName("out1") + "@RENAME@0", + f::GradVarName("out1") + "@RENAME@1"})); + EXPECT_EQ(sum_op->Output("Out"), + std::vector({f::GradVarName("out1")})); + + f::OpDesc *grad_op1 = block->AllOps()[7]; + ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + 
std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + EXPECT_EQ(var_to_grad.size(), 6UL); + EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); + EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("b3"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("y2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); +} + +TEST(Backward, half_backward) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + auto *op1 = block->AppendOp(); + op1->SetType("minus"); + op1->SetInput("X", {"a"}); + op1->SetInput("Y", {"b"}); + op1->SetOutput("Out", {"out"}); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"b"}); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + auto ops = block->AllOps(); + ASSERT_EQ(3UL, ops.size()); + + EXPECT_EQ(var_to_grad.size(), 2UL); + EXPECT_EQ(var_to_grad.at("a"), + f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1)); +} diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..9550159155c28247797a6caa5fc01c64a0c5f99f --- /dev/null +++ b/paddle/fluid/framework/block_desc.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +VarDesc *BlockDesc::Var(const std::string &name) { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } + need_update_ = true; + auto *var = new VarDesc(name); + vars_[name].reset(var); + return var; +} + +VarDesc *BlockDesc::FindVar(const std::string &name) const { + auto it = vars_.find(name); + if (it == vars_.end()) { + return nullptr; + } + return it->second.get(); +} + +bool BlockDesc::HasVar(const std::string &name) const { + return vars_.find(name) != vars_.end(); +} + +VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { + if (name == kEmptyVarName) return nullptr; + + auto it = vars_.find(name); + if (it == vars_.end()) { + return Parent() == kNoneBlockIndex ? 
nullptr + : ParentBlock()->FindVarRecursive(name); + } + return it->second.get(); +} + +VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { + VarDesc *res = FindVarRecursive(name_bytes); + if (res == nullptr) { + res = Var(name_bytes); + } + return *res; +} + +bool BlockDesc::HasVarRecursive(const std::string &name) const { + return FindVarRecursive(name) != nullptr; +} + +std::vector BlockDesc::AllVars() const { + std::vector res; + for (const auto &p : vars_) { + res.push_back(p.second.get()); + } + return res; +} + +OpDesc *BlockDesc::AppendOp() { + need_update_ = true; + ops_.emplace_back(new OpDesc(this)); + return ops_.back().get(); +} + +void BlockDesc::AppendAllocatedOp(std::unique_ptr &&op_desc) { + need_update_ = true; + ops_.emplace_back(std::move(op_desc)); +} + +OpDesc *BlockDesc::PrependOp() { + need_update_ = true; + ops_.emplace_front(new OpDesc(this)); + return ops_.front().get(); +} + +void BlockDesc::RemoveOp(size_t s, size_t e) { + if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { + return; + } + need_update_ = true; + for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { + auto names = (*it)->InputArgumentNames(); + for (auto n : names) { + // TODO(typhoonzero): delete vars if no other op use it. 
+ VLOG(3) << "deleting var " << n; + } + } + ops_.erase(ops_.begin() + s, ops_.begin() + e); +} + +std::vector BlockDesc::AllOps() const { + std::vector res; + for (const auto &op : ops_) { + res.push_back(op.get()); + } + return res; +} + +void BlockDesc::Flush() { + for (auto &op_desc : ops_) { + op_desc->Flush(); + } + + if (need_update_) { + auto &op_field = *this->desc_->mutable_ops(); + this->ClearPBOps(); + op_field.Reserve(static_cast(ops_.size())); + for (auto &op_desc : ops_) { + op_field.AddAllocated(op_desc->Proto()); + } + auto &var_field = *this->desc_->mutable_vars(); + this->ClearPBVars(); + var_field.Reserve(static_cast(vars_.size())); + for (auto &var_desc : vars_) { + var_field.AddAllocated(var_desc.second->Proto()); + } + need_update_ = false; + } +} + +BlockDesc *BlockDesc::ParentBlock() const { + if (this->desc_->parent_idx() == kNoneBlockIndex) { + return nullptr; + } + return prog_->MutableBlock(static_cast(this->desc_->parent_idx())); +} + +proto::BlockDesc *BlockDesc::Proto() { + Flush(); + return desc_; +} + +BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) + : prog_(prog), desc_(desc), need_update_(false) { + for (const proto::VarDesc &var_desc : desc_->vars()) { + vars_[var_desc.name()].reset(new VarDesc(var_desc)); + } + for (const proto::OpDesc &op_desc : desc_->ops()) { + ops_.emplace_back(new OpDesc(op_desc, prog, this)); + } +} + +BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, + ProgramDesc *prog) + : prog_(prog), desc_(desc) { + need_update_ = true; + for (auto &op : other.ops_) { + ops_.emplace_back(new OpDesc(*op->Proto(), prog, this)); + } + for (auto &it : other.vars_) { + auto *var = new VarDesc(*it.second); + vars_[it.first].reset(var); + } +} + +void BlockDesc::ClearPBOps() { + auto ops = this->desc_->mutable_ops(); + while (!ops->empty()) { + // we do not own the OpDesc, so release the ownership. 
+ ops->ReleaseLast(); + } +} + +void BlockDesc::ClearPBVars() { + auto vars = this->desc_->mutable_vars(); + while (!vars->empty()) { + // we do not own the VarDesc, so release the ownership. + vars->ReleaseLast(); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..5f7eca3878ff6174090c7b0dd4904f5604ac8dc6 --- /dev/null +++ b/paddle/fluid/framework/block_desc.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class ProgramDesc; + +// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize +// read/write speed. Only when we want the protobuf message, the local changes +// will be synchronized (by `Sync` method). 
+ +class BlockDesc { + public: + BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc); + + BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog); + + ~BlockDesc() { + this->ClearPBVars(); + this->ClearPBOps(); + } + + int32_t ID() const { return desc_->idx(); } + + int32_t Parent() const { return desc_->parent_idx(); } + + VarDesc *Var(const std::string &name_bytes); + + VarDesc *FindVar(const std::string &name_bytes) const; + + bool HasVar(const std::string &var_name) const; + + VarDesc *FindVarRecursive(const std::string &name_bytes) const; + + VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes); + + bool HasVarRecursive(const std::string &var_name) const; + + std::set LocalVarNames() const { + std::set var_names; + for (auto &var : vars_) { + var_names.insert(var.first); + } + return var_names; + } + + std::vector AllVars() const; + + BlockDesc *ParentBlock() const; + + OpDesc *AppendOp(); + + void AppendAllocatedOp(std::unique_ptr &&op_desc); + + OpDesc *PrependOp(); + + void RemoveOp(size_t s, size_t e); + + std::vector AllOps() const; + + size_t OpSize() const { return ops_.size(); } + + OpDesc *Op(int idx) { return ops_.at(idx).get(); } + + void Flush(); + + proto::BlockDesc *Proto(); + + ProgramDesc *Program() { return this->prog_; } + + private: + void ClearPBOps(); + void ClearPBVars(); + + private: + ProgramDesc *prog_; // not_own + proto::BlockDesc *desc_; // not_own + bool need_update_; + + std::deque> ops_; + std::unordered_map> vars_; + + DISABLE_COPY_AND_ASSIGN(BlockDesc); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h new file mode 100644 index 0000000000000000000000000000000000000000..5acf4fb39bbeb6bd45d215c962f10f0333578c02 --- /dev/null +++ b/paddle/fluid/framework/channel.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // for size_t + +namespace paddle { +namespace framework { + +// Channel is the abstract class of buffered and un-buffered channels. +template +class Channel { + public: + virtual bool Send(T*) = 0; + virtual bool Receive(T*) = 0; + virtual size_t Cap() = 0; + virtual void Close() = 0; + virtual ~Channel() {} +}; + +// Forward declaration of channel implementations. +namespace details { +template +class Buffered; +template +class UnBuffered; +} // namespace details + +template +Channel* MakeChannel(size_t buffer_size) { + if (buffer_size > 0) { + return new details::Buffered(buffer_size); + } + return new details::UnBuffered(); +} + +template +void CloseChannel(Channel* ch) { + ch->Close(); +} + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/details/buffered_channel.h" +#include "paddle/fluid/framework/details/unbuffered_channel.h" diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..953fa40fec8c0480726b44760a3a4c7f59c80a85 --- /dev/null +++ b/paddle/fluid/framework/channel_test.cc @@ -0,0 +1,510 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/channel.h" + +#include +#include + +#include "gtest/gtest.h" + +using paddle::framework::Channel; +using paddle::framework::MakeChannel; +using paddle::framework::CloseChannel; +using paddle::framework::details::Buffered; +using paddle::framework::details::UnBuffered; + +void RecevingOrderEqualToSendingOrder(Channel *ch) { + unsigned sum_send = 0; + std::thread t([&]() { + for (int i = 0; i < 5; i++) { + EXPECT_EQ(ch->Send(&i), true); + sum_send += i; + } + }); + for (int i = 0; i < 5; i++) { + int recv; + EXPECT_EQ(ch->Receive(&recv), true); + EXPECT_EQ(recv, i); + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 10U); + delete ch; +} + +TEST(Channel, MakeAndClose) { + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + { + // MakeChannel should return a buffered channel is buffer_size > 0. + auto ch = MakeChannel(10); + EXPECT_NE(dynamic_cast *>(ch), nullptr); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); + CloseChannel(ch); + delete ch; + } + { + // MakeChannel should return an un-buffered channel is buffer_size = 0. 
+ auto ch = MakeChannel(0); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); + EXPECT_NE(dynamic_cast *>(ch), nullptr); + CloseChannel(ch); + delete ch; + } +} + +TEST(Channel, SufficientBufferSizeDoesntBlock) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Send(&i), true); // should not block + } + + size_t out; + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), true); // should not block + EXPECT_EQ(out, i); + } + CloseChannel(ch); + delete ch; +} + +// This tests that a channel must return false +// on send and receive performed after closing the channel. +// Receive will only return false after close when queue is empty. +// By creating separate threads for sending and receiving, we make this +// function able to test both buffered and unbuffered channels. +void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { + const size_t data = 5; + std::thread send_thread{[&]() { + size_t i = data; + EXPECT_EQ(ch->Send(&i), true); // should not block + }}; + + std::thread recv_thread{[&]() { + size_t i; + EXPECT_EQ(ch->Receive(&i), true); // should not block + EXPECT_EQ(i, data); + }}; + + send_thread.join(); + recv_thread.join(); + + // After closing send should return false. Receive should + // also return false as there is no data in queue. 
+ CloseChannel(ch); + send_thread = std::thread{[&]() { + size_t i = data; + EXPECT_EQ(ch->Send(&i), false); // should return false + }}; + recv_thread = std::thread{[&]() { + size_t i; + // should return false because channel is closed and queue is empty + EXPECT_EQ(ch->Receive(&i), false); + }}; + + send_thread.join(); + recv_thread.join(); +} + +TEST(Channel, SendReceiveClosedBufferedChannelPanics) { + size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + SendReceiveWithACloseChannelShouldPanic(ch); + delete ch; +} + +TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) { + auto ch = MakeChannel(0); + SendReceiveWithACloseChannelShouldPanic(ch); + delete ch; +} + +TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Send(&i), true); // sending should not block + } + + size_t out; + for (size_t i = 0; i < buffer_size / 2; ++i) { + EXPECT_EQ(ch->Receive(&out), true); // receiving should not block + EXPECT_EQ(out, i); + } + + CloseChannel(ch); + + for (size_t i = buffer_size / 2; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), + true); // receving should return residual values. + EXPECT_EQ(out, i); + } + + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), + false); // receiving on closed channel should return false + } + delete ch; +} + +TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + size_t sum = 0; + std::thread t([&]() { + // Try to write more than buffer size. 
+ for (size_t i = 0; i < 2 * buffer_size; ++i) { + if (i < buffer_size) + EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations + else + EXPECT_EQ(ch->Send(&i), false); + sum += i; + } + }); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + EXPECT_EQ(sum, 45U); + + CloseChannel(ch); + t.join(); + delete ch; +} + +TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { + auto ch = MakeChannel(0); + RecevingOrderEqualToSendingOrder(ch); +} + +TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) { + auto ch = MakeChannel(10); + RecevingOrderEqualToSendingOrder(ch); +} + +void ChannelCloseUnblocksReceiversTest(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked because of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the channel + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +void ChannelCloseUnblocksSendersTest(Channel *ch) { + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + bool send_success[num_threads]; + + // Launches threads that try to write and are blocked because of 
no readers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + send_success[i] = false; + t[i] = std::thread( + [&](bool *ended, bool *success) { + int data = 10; + *success = ch->Send(&data); + *ended = true; + }, + &thread_ended[i], &send_success[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + if (dynamic_cast *>(ch)) { + // If ch is Buffered, atleast 4 threads must be blocked. + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (!thread_ended[i]) ct++; + } + EXPECT_GE(ct, 4); + } else { + // If ch is UnBuffered, all the threads should be blocked. + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + } + // Explicitly close the thread + // This should unblock all senders + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + if (dynamic_cast *>(ch)) { + // Verify that only 1 send was successful + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + // Only 1 send must be successful + EXPECT_EQ(ct, 1); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +// This tests that closing a buffered channel also unblocks +// any receivers waiting on the channel +TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +// This tests that closing a buffered channel also unblocks +// any senders waiting for channel to have write space +TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksSendersTest(ch); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any receivers waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { + auto ch = 
MakeChannel(0); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any senders waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(0); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +TEST(Channel, UnbufferedLessReceiveMoreSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + // Send should block after three iterations + // since we only have three receivers. + std::thread t([&]() { + // Try to send more number of times + // than receivers + for (int i = 0; i < 4; i++) { + ch->Send(&i); + sum_send += i; + } + }); + for (int i = 0; i < 3; i++) { + int recv; + ch->Receive(&recv); + EXPECT_EQ(recv, i); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + EXPECT_EQ(sum_send, 3U); + + CloseChannel(ch); + t.join(); + delete ch; +} + +TEST(Channel, UnbufferedMoreReceiveLessSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + unsigned sum_receive = 0; + // The receiver should block after 5 + // iterations, since there are only 5 senders. + std::thread t([&]() { + for (int i = 0; i < 8; i++) { + int recv; + ch->Receive(&recv); // should block after the fifth iteration. 
+ EXPECT_EQ(recv, i); + sum_receive += i; + } + }); + for (int i = 0; i < 5; i++) { + ch->Send(&i); + sum_send += i; + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + EXPECT_EQ(sum_send, 10U); + EXPECT_EQ(sum_receive, 10U); + // send three more elements + for (int i = 5; i < 8; i++) { + ch->Send(&i); + sum_send += i; + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 28U); + EXPECT_EQ(sum_receive, 28U); + delete ch; +} + +// This tests that destroying a channel unblocks +// any senders waiting for channel to have write space +void ChannelDestroyUnblockSenders(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + bool send_success[num_threads]; + + // Launches threads that try to write and are blocked because of no readers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + send_success[i] = false; + t[i] = std::thread( + [&](bool *ended, bool *success) { + int data = 10; + *success = ch->Send(&data); + *ended = true; + }, + &thread_ended[i], &send_success[i]); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + bool is_buffered_channel = false; + if (dynamic_cast *>(ch)) is_buffered_channel = true; + + if (is_buffered_channel) { + // If channel is buffered, verify that atleast 4 threads are blocked + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (thread_ended[i] == false) ct++; + } + // Atleast 4 threads must be blocked + EXPECT_GE(ct, 4); + } else { + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + } + // Explicitly destroy the channel + delete ch; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + // Count number of successfuld sends + int ct = 0; + for (size_t i 
= 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + + if (is_buffered_channel) { + // Only 1 send must be successful + EXPECT_EQ(ct, 1); + } else { + // In unbuffered channel, no send should be successful + EXPECT_EQ(ct, 0); + } + + // Join all threads + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +// This tests that destroying a channel also unblocks +// any receivers waiting on the channel +void ChannelDestroyUnblockReceivers(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked because of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + // All reads should return false + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that all threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + // delete the channel + delete ch; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { + size_t buffer_size = 1; + auto ch = MakeChannel(buffer_size); + ChannelDestroyUnblockReceivers(ch); +} + +TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) { + size_t buffer_size = 1; + auto ch = MakeChannel(buffer_size); + ChannelDestroyUnblockSenders(ch); +} + +// This tests that destroying an unbuffered channel also unblocks +// unblocks any receivers waiting for senders +TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) { + auto ch = MakeChannel(0); + ChannelDestroyUnblockReceivers(ch); +} + +TEST(Channel, 
UnbufferedChannelDestroyUnblocksSendersTest) { + auto ch = MakeChannel(0); + ChannelDestroyUnblockSenders(ch); +} diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c6dd28455b02aa71d8ed09d8c2c81397a6f9955 --- /dev/null +++ b/paddle/fluid/framework/data_device_transform.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_device_transform.h" + +namespace paddle { +namespace framework { + +static const platform::DeviceContext* GetDeviceContext( + const platform::Place& src_place, const platform::Place& dst_place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) { + return pool.Get(src_place); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + return pool.Get(dst_place); + } else { + PADDLE_THROW( + "Currently, model parallelism is only supported between CPU and CUDA"); + } +} + +void TransDataDevice(const Tensor& in, const platform::Place& dst_place, + Tensor* out) { + VLOG(3) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; + auto* dev_ctx = GetDeviceContext(in.place(), dst_place); + dev_ctx->Wait(); + Copy(in, dst_place, *dev_ctx, out); + dev_ctx->Wait(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..0c4559f586aaf3cc055f9b53b050b3f3a97573bd --- /dev/null +++ b/paddle/fluid/framework/data_device_transform.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +void TransDataDevice(const Tensor& in, const platform::Place& dst_place, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..f740f9b3268be31973774674bdea9eb404f718ed --- /dev/null +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input1 of test op"); + AddOutput("output", "output of test op"); + AddAttr("use_gpu", "force to use gpu kernel").SetDefault(false); + AddComment("This is test op"); + } +}; + +class TestOpWithKernel : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + if (Attr("use_gpu")) { + VLOG(3) << "force use gpu kernel"; + return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0)); + } else { + VLOG(3) << "use default kernel"; + return OpKernelType(proto::DataType::FP32, + ctx.Input("input")->place()); + } + } +}; + +template +class TestKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + + const Tensor* input = ctx.Input("input"); + + std::cout << "input place:" << input->place() << std::endl; + auto* output = ctx.Output("output"); + output->Resize(input->dims()); + output->mutable_data(ctx.GetPlace()); + + operators::TransformFunctor, T, DeviceContext> functor( + input, input, output, ctx.template device_context(), + 
AddFunctor()); + functor.Run(); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + test_op, paddle::framework::TestOpWithKernel, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL( + test_op, + paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL( + test_op, + paddle::framework::TestKernel); + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +TEST(Operator, CPUtoGPU) { + using namespace paddle::framework; + using namespace paddle::platform; + InitDevices(); + + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + + // create an op to run on CPU + paddle::framework::proto::OpDesc cpu_op_desc; + cpu_op_desc.set_type("test_op"); + BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs()); + + auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc); + // prepare input + auto* in_t = scope.Var("IN1")->GetMutable(); + auto* src_ptr = in_t->mutable_data({2, 3}, CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + + // get output + auto* output = scope.Var("OUT1"); + cpu_op->Run(scope, cpu_place); + + auto* output_ptr = output->Get().data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output_ptr[i], static_cast(i) * 2); + } + + // create an op to run on GPU + paddle::framework::proto::OpDesc gpu_op_desc; + gpu_op_desc.set_type("test_op"); + BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs()); + BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs()); + + auto attr = gpu_op_desc.mutable_attrs()->Add(); + attr->set_name("use_gpu"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + auto gpu_op = 
paddle::framework::OpRegistry::CreateOp(gpu_op_desc); + + paddle::platform::CUDAPlace cuda_place(0); + // get output + auto* output2 = scope.Var("OUT2"); + gpu_op->Run(scope, cuda_place); + VLOG(3) << "after gpu_op run"; + + // auto* output2_ptr = output2->Get().data(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(cuda_place); + + paddle::framework::Tensor output_tensor; + Copy(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx, + &output_tensor); + + dev_ctx->Wait(); + float* output2_ptr = output_tensor.data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output2_ptr[i], static_cast(i) * 4); + } +} diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h new file mode 100644 index 0000000000000000000000000000000000000000..b72f13f2e8f28556c195e65e3096b6ef1ba9e13a --- /dev/null +++ b/paddle/fluid/framework/data_layout.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include <cctype>
+#include <ostream>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+// Data layout identifiers.
+enum class DataLayout {
+  kNHWC = 0,
+  kNCHW = 1,
+  kAnyLayout = 2,
+};
+// Parses a layout name, ignoring case; unknown names go to PADDLE_THROW.
+inline DataLayout StringToDataLayout(const std::string& str) {
+  std::string s(str);
+  for (size_t i = 0; i < s.size(); ++i) {
+    s[i] = static_cast<char>(toupper(static_cast<unsigned char>(s[i])));
+  }
+  // "ANY_LAYOUT" is the spelling DataLayoutToString emits, so accept it too.
+  if (s == "NHWC") {
+    return DataLayout::kNHWC;
+  } else if (s == "NCHW") {
+    return DataLayout::kNCHW;
+  } else if (s == "ANYLAYOUT" || s == "ANY_LAYOUT") {
+    return DataLayout::kAnyLayout;
+  } else {
+    PADDLE_THROW("Unknown storage order string: %s", s);
+  }
+}
+// Returns the name of a layout; other enum values go to PADDLE_THROW.
+inline std::string DataLayoutToString(const DataLayout& data_layout) {
+  switch (data_layout) {
+    case DataLayout::kNHWC:
+      return "NHWC";
+    case DataLayout::kNCHW:
+      return "NCHW";
+    case DataLayout::kAnyLayout:
+      return "ANY_LAYOUT";
+    default:
+      PADDLE_THROW("unknown DataLayout %d", static_cast<int>(data_layout));
+  }
+}
+// Streams DataLayoutToString(l).
+inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) {
+  out << DataLayoutToString(l);
+  return out;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c546a508fe1bc1c6e1608cb16a2e1f708a083895
--- /dev/null
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_layout_transform.h" + +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace framework { + +std::vector GetAxis(const DataLayout& from, const DataLayout& to) { + PADDLE_ENFORCE_NE(from, to, + "layout transform should transform different layout"); + if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) { + return {0, 2, 3, 1}; + } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) { + return {0, 3, 1, 2}; + } else { + PADDLE_THROW("unsupported transform"); + } +} + +struct CastDataLayout { + CastDataLayout(const platform::DeviceContext* ctx, + const std::vector& axis, const framework::Tensor& in, + framework::Tensor* out) + : in_(in), out_(out), ctx_(ctx), axis_(axis) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + const std::vector axis_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + if (platform::is_cpu_place(place)) { + operators::math::Transpose trans4; + auto* context = static_cast(ctx_); + trans4(*context, in_, out_, axis_); + } else { + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataLayout(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out) { + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_type_for_var.place_, + expected_kernel_type.place_), + "TransDataLayout only support DataLayout transform on same place!"); + + PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!"); + + auto& pool = platform::DeviceContextPool::Instance(); + + auto src_dim = in.dims(); + std::vector dst_dim; + + auto axis = GetAxis(kernel_type_for_var.data_layout_, + expected_kernel_type.data_layout_); + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = 
src_dim[axis[i]]; + } + + out->Resize(make_ddim(dst_dim)); + out->mutable_data(expected_kernel_type.place_, in.type()); + + framework::VisitDataType( + framework::ToDataType(in.type()), + CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out)); + + out->set_layout(expected_kernel_type.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..862405fbf466cd5e5fc819f42cc7392b1c4ca624 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform.h @@ -0,0 +1,31 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +std::vector GetAxis(const DataLayout& from, const DataLayout& to); + +void TransDataLayout(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..99eb46bde34b089c3da65885748a1e77fe40c700 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/data_layout_transform.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(DataTransform, DataLayoutFunction) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto place = CPUPlace(); + Tensor in = Tensor(); + Tensor out = Tensor(); + in.mutable_data(make_ddim({2, 3, 1, 2}), place); + in.set_layout(DataLayout::kNHWC); + + auto kernel_nhwc = OpKernelType(proto::DataType::FP32, place, + DataLayout::kNHWC, LibraryType::kPlain); + auto kernel_ncwh = OpKernelType(proto::DataType::FP32, place, + DataLayout::kNCHW, LibraryType::kPlain); + + TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out); + + EXPECT_TRUE(out.layout() == DataLayout::kNCHW); + EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1})); + + TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out); + + EXPECT_TRUE(in.layout() == DataLayout::kNHWC); + EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2})); +} \ No newline at end of file diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..9575d01af8875cc21061979e54ce0612d8a7f3a5 --- /dev/null +++ b/paddle/fluid/framework/data_transform.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_transform.h" + +#include "paddle/fluid/framework/data_device_transform.h" +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/data_type_transform.h" + +namespace paddle { +namespace framework { + +static void PassTensorData(Tensor* from, Tensor* to) { + to->ShareDataWith(*from); + *from = Tensor(); +} + +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* output_tensor) { + bool transformed = false; + Tensor in; + in.ShareDataWith(input_tensor); + Tensor out; + + // do layout transform + if (NeedTransformLayout(expected_kernel_type.data_layout_, + kernel_type_for_var.data_layout_)) { + TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + transformed = true; + PassTensorData(&out, &in); + } + + if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) { + TransDataType(kernel_type_for_var, expected_kernel_type, in, &out); + transformed = true; + PassTensorData(&out, &in); + } + + // do device transform + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_type.place_)) { + TransDataDevice(in, expected_kernel_type.place_, &out); + transformed = true; + PassTensorData(&out, &in); + } + + PADDLE_ENFORCE(transformed, "No transform is applied, please check!"); + // get output data + output_tensor->ShareDataWith(in); +} + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var) { + if (in_var.IsType()) { + auto& in_lod_tensor = in_var.Get(); + auto* tran_lod_tensor = out_var.GetMutable(); + tran_lod_tensor->set_lod(in_lod_tensor.lod()); + tran_lod_tensor->set_layout(in_lod_tensor.layout()); + tran_lod_tensor->ShareDataWith(tensor); + } else if (in_var.IsType()) { + auto& in_selected_rows = in_var.Get(); + auto* trans_selected_rows = out_var.GetMutable(); + 
trans_selected_rows->set_height(in_selected_rows.height()); + trans_selected_rows->set_rows(in_selected_rows.rows()); + trans_selected_rows->mutable_value()->ShareDataWith(tensor); + } else { + PADDLE_THROW("unknown var type"); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..70d3a174accc8beda06c550bc5ac9ee97897eb2e --- /dev/null +++ b/paddle/fluid/framework/data_transform.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* out); + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..7a527f0d0c12806045d21b1cf279ccfd2cf73c8d --- /dev/null +++ b/paddle/fluid/framework/data_type.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+#include <typeindex>
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+// Maps a C++ type to its proto::DataType; other types go to PADDLE_THROW.
+inline proto::DataType ToDataType(std::type_index type) {
+  using namespace paddle::framework::proto;
+  if (type == typeid(float)) {  // compare type identity, not hash codes
+    return DataType::FP32;
+  } else if (type == typeid(double)) {
+    return DataType::FP64;
+  } else if (type == typeid(int)) {
+    return DataType::INT32;
+  } else if (type == typeid(int64_t)) {
+    return DataType::INT64;
+  } else if (type == typeid(bool)) {
+    return DataType::BOOL;
+  } else {
+    PADDLE_THROW("Not supported");
+  }
+}
+// Inverse of ToDataType for the same five tags.
+inline std::type_index ToTypeIndex(proto::DataType type) {
+  using namespace paddle::framework::proto;
+  switch (type) {
+    case DataType::FP32:
+      return typeid(float);
+    case DataType::FP64:
+      return typeid(double);
+    case DataType::INT32:
+      return typeid(int);
+    case DataType::INT64:
+      return typeid(int64_t);
+    case DataType::BOOL:
+      return typeid(bool);
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+// Calls visitor.operator()<T>() with the C++ type T that matches `type`.
+template <typename Visitor>
+inline void VisitDataType(proto::DataType type, Visitor visitor) {
+  using namespace paddle::framework::proto;
+  switch (type) {
+    case DataType::FP32:
+      visitor.template operator()<float>();
+      break;
+    case DataType::FP64:
+      visitor.template operator()<double>();
+      break;
+    case DataType::INT32:
+      visitor.template operator()<int>();
+      break;
+    case DataType::INT64:
+      visitor.template operator()<int64_t>();
+      break;
+    case DataType::BOOL:
+      visitor.template operator()<bool>();
+      break;
+    default:
+      PADDLE_THROW("Not supported");
+  }
+}
+// Returns a lowercase name for `type`; FP16 and INT16 are named here as well.
+inline std::string DataTypeToString(const proto::DataType type) {
+  using namespace paddle::framework::proto;
+  switch (type) {
+    case DataType::FP16:
+      return "float16";
+    case DataType::FP32:
+      return "float32";
+    case DataType::FP64:
+      return "float64";
+    case DataType::INT16:
+      return "int16";
+    case DataType::INT32:
+      return "int32";
+    case DataType::INT64:
+      return "int64";
+    case DataType::BOOL:
+      return "bool";
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+// Streams DataTypeToString(type).
+inline std::ostream& operator<<(std::ostream& out,
+                                const proto::DataType& type) {
+  out << DataTypeToString(type);
+  return out;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6921927305aa3a7dee801ead888737a1ab93fc8e
--- /dev/null
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void operator()() { + auto* in_begin = in_.data(); + auto* in_end = in_begin + in_.numel(); + auto* out_begin = out_->mutable_data(in_.place()); + + if (platform::is_cpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + } else { + // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type? 
+ PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataType(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + out->Resize(in.dims()); + auto src_type = kernel_type_for_var.data_type_; + auto dst_type = expected_kernel_type.data_type_; + auto ctx = pool.Get(in.place()); + + switch (src_type) { + case proto::DataType::FP32: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::DataType::FP64: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::DataType::INT32: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::DataType::INT64: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::DataType::BOOL: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + default: + PADDLE_THROW("Not support type %d", src_type); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..830cced093913839ddef2841b3de1017dc2bc426 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +using KernelTypePair = std::pair; + +void TransDataType(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..88dbc51b21718e2261f1c9485177621a827485e2 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "gtest/gtest.h" + +TEST(DataTypeTransform, CPUTransform) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto place = CPUPlace(); + + Tensor in; + Tensor out; + + float* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i / 3; + } + + auto kernel_fp32 = OpKernelType(proto::DataType::FP32, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp64 = OpKernelType(proto::DataType::FP64, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int32 = OpKernelType(proto::DataType::INT32, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + + TransDataType(kernel_fp32, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_double[i], static_cast(i / 3)); + } + + TransDataType(kernel_fp32, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int[i], static_cast(i / 3)); + } +} diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc new file mode 100644 index 0000000000000000000000000000000000000000..f063ee2e6dd81e66b7b74aa23e9967d865c4d297 --- /dev/null +++ b/paddle/fluid/framework/ddim.cc @@ -0,0 +1,318 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+/// @cond HIDDEN
+// Builds a Dim<i> from the first i entries of d, one head element at a time.
+template <int i>
+Dim<i> make_dim(const int64_t* d) {
+  return Dim<i>(*d, make_dim<i - 1>(d + 1));
+}
+// Recursion terminator for the one-dimensional case.
+template <>
+Dim<1> make_dim<1>(const int64_t* d) {
+  return Dim<1>(*d);
+}
+// Assigns to ddim a Dim of rank n read from dims; n outside [1, 9] is an error.
+void make_ddim(DDim& ddim, const int64_t* dims, int n) {
+  switch (n) {
+    case 1:
+      ddim = make_dim<1>(dims);
+      break;
+    case 2:
+      ddim = make_dim<2>(dims);
+      break;
+    case 3:
+      ddim = make_dim<3>(dims);
+      break;
+    case 4:
+      ddim = make_dim<4>(dims);
+      break;
+    case 5:
+      ddim = make_dim<5>(dims);
+      break;
+    case 6:
+      ddim = make_dim<6>(dims);
+      break;
+    case 7:
+      ddim = make_dim<7>(dims);
+      break;
+    case 8:
+      ddim = make_dim<8>(dims);
+      break;
+    case 9:
+      ddim = make_dim<9>(dims);
+      break;
+    default:
+      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
+  }
+}
+
+/// @endcond
+// Builds a DDim from a brace-enclosed list of extents.
+DDim make_ddim(std::initializer_list<int64_t> dims) {
+  DDim result(make_dim(0));
+  make_ddim(result, dims.begin(), dims.size());
+  return result;
+}
+// Builds a DDim from a vector of extents; an empty vector fails the rank check.
+DDim make_ddim(const std::vector<int64_t>& dims) {
+  DDim result(make_dim(0));
+  make_ddim(result, dims.data(), dims.size());  // data() is valid when empty
+  return result;
+}
+// Same as above for int extents, widened element-wise to int64_t.
+DDim make_ddim(const std::vector<int>& dims) {
+  std::vector<int64_t> res(dims.size());
+  std::transform(dims.begin(), dims.end(), res.begin(),
+                 [](int d) { return static_cast<int64_t>(d); });
+  return make_ddim(res);
+}
+
+/// @cond HIDDEN
+// XXX For some reason, putting this in an anonymous namespace causes errors
+class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
+ public:
+  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
+
+  template <int D>
+  int64_t& operator()(Dim<D>& dim) const {
+    return dim[idx_];
+  }
+
+ private:
+  int idx_;
+};
+// Visitor returning element idx_ of the visited Dim by value.
+class DynamicConstIndexer : public boost::static_visitor<int64_t> {
+ public:
+  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
+
+  template <int D>
+  int64_t operator()(const Dim<D>& dim) const {
+    return dim[idx_];
+  }
+
+ private:
+  int idx_;
+};
+
+/// @endcond
+
+int64_t& DDim::operator[](int idx) { + return boost::apply_visitor(DynamicMutableIndexer(idx), var); +} + +int64_t DDim::operator[](int idx) const { + return boost::apply_visitor(DynamicConstIndexer(idx), var); +} + +int DDim::size() const { return arity(*this); } + +bool DDim::operator==(DDim d) const { + if (var.which() != d.getVar().which()) { + return false; + } else { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + for (unsigned int i = 0; i < v1.size(); i++) { + if (v1[i] != v2[i]) { + return false; + } + } + + return true; + } +} + +bool DDim::operator!=(DDim d) const { return !(*this == d); } + +DDim DDim::operator+(DDim d) const { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + std::vector v3; + + assert(v1.size() == v2.size()); + + for (unsigned int i = 0; i < v1.size(); i++) { + v3.push_back(v1[i] + v2[i]); + } + + return make_ddim(v3); +} + +DDim DDim::operator*(DDim d) const { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + std::vector v3; + + assert(v1.size() == v2.size()); + + for (unsigned int i = 0; i < v1.size(); i++) { + v3.push_back(v1[i] * v2[i]); + } + + return make_ddim(v3); +} + +int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } + +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } + +/// @cond HIDDEN +struct VectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + + explicit VectorizeVisitor(std::vector& v) : vector(v) {} + + template + void operator()(const T& t) { + vector.push_back(t.head); + this->operator()(t.tail); + } + + void operator()(const Dim<1>& t) { vector.push_back(t.head); } +}; +/// @endcond + +std::vector vectorize(const DDim& ddim) { + std::vector result; + VectorizeVisitor visitor(result); + boost::apply_visitor(visitor, ddim); + return result; +} + +// NOTE: framework::vectorize converts to type int64_t +// which does not fit cudnn inputs. 
+std::vector vectorize2int(const DDim& ddim) { + std::vector temp = vectorize(ddim); + std::vector result(temp.begin(), temp.end()); + return result; +} + +struct ProductVisitor : public boost::static_visitor { + template + int64_t operator()(const Dim& dim) { + return product(dim); + } +}; + +int64_t product(const DDim& ddim) { + ProductVisitor visitor; + return boost::apply_visitor(visitor, ddim); +} + +struct SliceVectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + int begin; + int end; + + SliceVectorizeVisitor(std::vector& v, int b, int e) + : vector(v), begin(b), end(e) { + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); + } + + template + void operator()(const Dim& dim) { + if (begin == 0) { + vector.push_back(dim.head); + } else { + --begin; + } + --end; + if (end > 0) { + this->operator()(dim.tail); + } + } + + void operator()(const Dim<1>& dim) { + PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound."); + vector.push_back(dim.head); + } +}; + +DDim slice_ddim(const DDim& dim, int begin, int end) { + std::vector vec; + vec.reserve(end - begin); + SliceVectorizeVisitor visitor(vec, begin, end); + boost::apply_visitor(visitor, dim); + return make_ddim(vec); +} + +/// \cond HIDDEN + +struct ArityVisitor : boost::static_visitor { + template + int operator()(Dim) const { + return D; + } +}; + +/// \endcond + +int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } + +/// \cond HIDDEN + +struct DDimPrinter : boost::static_visitor { + std::ostream& os; + explicit DDimPrinter(std::ostream& os_) : os(os_) {} + + template + void operator()(const T& t) { + os << t; + } +}; + +/// \endcond + +std::ostream& operator<<(std::ostream& os, const DDim& ddim) { + DDimPrinter printer(os); + boost::apply_visitor(printer, ddim); + return os; +} + +DDim::DDim(std::initializer_list init_list) { + 
*this = make_ddim(init_list); +} + +DDim flatten_to_2d(const DDim& src, int num_col_dims) { + int rank = src.size(); + return make_ddim({product(slice_ddim(src, 0, num_col_dims)), + product(slice_ddim(src, num_col_dims, rank))}); +} + +DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } + +DDim stride(const DDim& ddim) { + std::vector strides(ddim.size()); + strides[ddim.size() - 1] = 1; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i + 1]; + } + return framework::make_ddim(strides); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h new file mode 100644 index 0000000000000000000000000000000000000000..750ab787abb72fa3f2984caf58354e327750aa3d --- /dev/null +++ b/paddle/fluid/framework/ddim.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/dim.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { + +/** + * \brief A dynamically sized dimension. + * + * The number of dimensions must be between [1, 9]. 
+ */ +struct DDim { + typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, + Dim<8>, Dim<9>> + DDimVar; + DDimVar var; + + DDim() : var(Dim<1>()) {} + + template + explicit DDim(const Dim& in) : var(in) {} + + /*implicit*/ DDim(std::initializer_list init_list); + + template + DDim& operator=(const Dim& in) { + var = in; + return *this; + } + + int64_t& operator[](int idx); + int64_t operator[](int idx) const; + + template + typename Visitor::result_type apply_visitor(Visitor& visitor) { + return var.apply_visitor(visitor); + } + + template + typename Visitor::result_type apply_visitor(Visitor& visitor) const { + return var.apply_visitor(visitor); + } + + DDimVar getVar() { return var; } + + bool operator==(DDim d) const; + + bool operator!=(DDim d) const; + + DDim operator+(DDim d) const; + + DDim operator*(DDim d) const; + + int size() const; +}; + +/** + * \brief Make a DDim from std::vector + * + * \param dims An vector of ints. Must be sized between [1, 9] + */ +DDim make_ddim(const std::vector& dims); + +DDim make_ddim(const std::vector& dims); + +/** + * \brief Make a DDim from an initializer list + * + * \param dims An initializer list of ints. Must be sized between [1, 9] + * + */ +DDim make_ddim(std::initializer_list dims); + +int64_t get(const DDim& dim, int idx); +void set(DDim& dim, int idx, int val); + +std::vector vectorize(const DDim& ddim); +std::vector vectorize2int(const DDim& ddim); + +int64_t product(const DDim& ddim); + +/** + * \brief Slice a ddim + * + * Slice dim with [begin, end). + * e.g. DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ +DDim slice_ddim(const DDim& dim, int begin, int end); + +/** + * \brief What is the length of this dimension? + * + * \param Dynamic dimension to inspect + */ + +int arity(const DDim& ddim); + +std::ostream& operator<<(std::ostream&, const DDim&); + +// Reshape a tensor to a matrix. 
The matrix's first dimension(column length) +// will be the product of tensor's first `num_col_dims` dimensions. +DDim flatten_to_2d(const DDim& src, int num_col_dims); + +DDim flatten_to_1d(const DDim& src); + +DDim stride(const DDim& ddim); +} // namespace framework +} // namespace paddle + +namespace boost { + +template +T get(const paddle::framework::DDim& in) { + return boost::get(in.var); +} + +} // namespace boost diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..18d305a4036840066a7d9c999a7e73db863274d7 --- /dev/null +++ b/paddle/fluid/framework/ddim_test.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ddim.h" + +TEST(DDim, Equality) { + // construct a DDim from an initialization list + paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5}); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // construct a DDim from a vector + std::vector vec({9, 1, 5}); + paddle::framework::DDim vddim = paddle::framework::make_ddim(vec); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // mutate a DDim + ddim[1] = 2; + EXPECT_EQ(ddim[1], 2); + paddle::framework::set(ddim, 0, 6); + EXPECT_EQ(paddle::framework::get(ddim, 0), 6); + + // vectorize a DDim + std::vector res_vec = paddle::framework::vectorize(vddim); + EXPECT_EQ(res_vec[0], 9); + EXPECT_EQ(res_vec[1], 1); + EXPECT_EQ(res_vec[2], 5); + paddle::framework::Dim<3> d(3, 2, 1); + res_vec = paddle::framework::vectorize(paddle::framework::DDim(d)); + EXPECT_EQ(res_vec[0], 3); + EXPECT_EQ(res_vec[1], 2); + EXPECT_EQ(res_vec[2], 1); + + // add two DDims + paddle::framework::DDim ddim_sum = ddim + vddim; + EXPECT_EQ(ddim_sum[0], 15); + EXPECT_EQ(ddim_sum[1], 3); + EXPECT_EQ(ddim_sum[2], 10); + + // multiply two DDims + paddle::framework::DDim ddim_mul = ddim * vddim; + EXPECT_EQ(ddim_mul[0], 54); + EXPECT_EQ(ddim_mul[1], 2); + EXPECT_EQ(ddim_mul[2], 25); + + // arity of a DDim + EXPECT_EQ(paddle::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); + + // product of a DDim + EXPECT_EQ(paddle::framework::product(vddim), 45); + EXPECT_EQ( + paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), + 90); + + // slice a DDim + paddle::framework::DDim ddim2 = + paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); + paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6); + 
EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); +} + +TEST(DDim, Print) { + // print a DDim + std::stringstream ss; + paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4}); + ss << ddim; + EXPECT_EQ("2, 3, 4", ss.str()); +} diff --git a/paddle/fluid/framework/details/buffered_channel.h b/paddle/fluid/framework/details/buffered_channel.h new file mode 100644 index 0000000000000000000000000000000000000000..88faf3acf7c17b0cb3770a8910e400a1f6688f5f --- /dev/null +++ b/paddle/fluid/framework/details/buffered_channel.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +// Four of the properties of Buffered Channel: +// - A send to a full channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from an empty channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. 
+ +template +class Buffered : public paddle::framework::Channel { + friend Channel* paddle::framework::MakeChannel(size_t); + friend void paddle::framework::CloseChannel(Channel*); + + public: + virtual bool Send(T*); + virtual bool Receive(T*); + virtual size_t Cap() { return cap_; } + virtual void Close(); + virtual ~Buffered(); + + private: + size_t cap_; + std::mutex mu_; + std::condition_variable empty_cond_var_; + std::condition_variable full_cond_var_; + std::condition_variable destructor_cond_var_; + std::deque channel_; + std::atomic closed_{false}; + std::atomic send_ctr{0}; + std::atomic recv_ctr{0}; + + Buffered(size_t cap) : cap_(cap), closed_(false) { + PADDLE_ENFORCE_GT(cap, 0); + } + + void NotifyAllParticipants(std::unique_lock*); +}; + +template +bool Buffered::Send(T* item) { + bool ret = false; + if (closed_) { + return ret; + } + send_ctr++; + std::unique_lock lock(mu_); + full_cond_var_.wait(lock, + [this]() { return channel_.size() < cap_ || closed_; }); + if (!closed_) { + channel_.push_back(std::move(*item)); + lock.unlock(); + empty_cond_var_.notify_one(); + ret = true; + } + send_ctr--; + destructor_cond_var_.notify_one(); + return ret; +} + +template +bool Buffered::Receive(T* item) { + bool ret = false; + // Once the channel has been closed and all data has been consumed, + // just return false. Don't even try acquiring the mutex. 
+ if (closed_ && channel_.empty()) { + return false; + } + recv_ctr++; + std::unique_lock lock(mu_); + empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; }); + if (!channel_.empty()) { + *item = std::move(channel_.front()); + channel_.pop_front(); + full_cond_var_.notify_one(); + ret = true; + } + recv_ctr--; + destructor_cond_var_.notify_one(); + return ret; +} + +template +void Buffered::Close() { + if (closed_) { + return; + } + std::unique_lock lock(mu_); + closed_ = true; + NotifyAllParticipants(&lock); +} + +template +Buffered::~Buffered() { + std::unique_lock lock(mu_); + closed_ = true; + channel_.clear(); + NotifyAllParticipants(&lock); + + // The destructor must wait for all readers and writers to complete their task + // The channel has been closed, so we will not accept new readers and writers + lock.lock(); + destructor_cond_var_.wait( + lock, [this]() { return send_ctr == 0 && recv_ctr == 0; }); +} + +template +void Buffered::NotifyAllParticipants(std::unique_lock* lock) { + lock->unlock(); + full_cond_var_.notify_all(); + empty_cond_var_.notify_all(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h new file mode 100644 index 0000000000000000000000000000000000000000..69bcea625288eba897e761a1d634f19c41dc0f79 --- /dev/null +++ b/paddle/fluid/framework/details/cow_ptr.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +// Change it to thread safe flags if needed. +class ThreadUnsafeOwnershipFlags { + public: + ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} + + ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete; + ThreadUnsafeOwnershipFlags& operator=( + const ThreadUnsafeOwnershipFlags& other) = delete; + ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default; + + void SetOwnership(bool flag) { flag_ = flag; } + + // Invoke the callback if it is not owned. + template + void AcquireOwnershipOnce(Callback acquire) { + if (!flag_) { + acquire(); + flag_ = true; + } + } + + private: + bool flag_; +}; + +// Copy-On-Write pointer. +// It will hold a T* pointer, and only copy once when `MutableData` is invoked. +// +// The template parameter OwnershipFlags should have: +// * a constructor takes a bool. True if own. +// * SetOwnership(bool flag). +// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not +// owned. +// +// https://en.wikipedia.org/wiki/Copy-on-write +template +class COWPtr { + public: + // Ctor from raw pointer. + explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} + + // Move methods. Steal ownership from origin + COWPtr(COWPtr&& other) + : payload_(other.payload_), ownership_{std::move(other.ownership_)} {} + COWPtr& operator=(COWPtr&& origin) = default; + + // Copy methods. Not own payload + COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {} + COWPtr& operator=(const COWPtr& other) { + payload_ = other.payload_; + ownership_.SetOwnership(false); + return *this; + } + + // Access read only data. + const T& Data() const { return *payload_; } + + // Access mutable data. If the data is not owned, the data will be copied + // before. 
+ T* MutableData() { + ownership_.AcquireOwnershipOnce( + [this] { payload_.reset(new T(*payload_)); }); + return payload_.get(); + } + + private: + // Actual data pointer. + std::shared_ptr payload_; + + // Ownership flag. + OwnershipFlags ownership_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2142af277c0b356d83941b3baab1947cce31dac --- /dev/null +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/framework/details/cow_ptr.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(COWPtr, all) { + COWPtr ptr(new int{0}); + ASSERT_EQ(ptr.Data(), 0); + COWPtr ptr2 = ptr; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(&ptr2.Data(), &ptr.Data()); + *ptr2.MutableData() = 10; + ASSERT_EQ(ptr.Data(), 0); + ASSERT_EQ(ptr2.Data(), 10); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..d73604ad185a66ade0168f585d1951d0d7d4a5f9 --- /dev/null +++ b/paddle/fluid/framework/details/op_registry.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace framework { +namespace details { + +enum OpInfoFillType { + kOperator = 0, + kOpProtoAndCheckerMaker = 1, + kGradOpDescMaker = 2, + kVarTypeInference = 3, + kShapeInference = 4 +}; + +template +struct OpInfoFillTypeID { + static constexpr OpInfoFillType ID() { + return std::is_base_of::value + ? 
kOperator + : (std::is_base_of::value + ? kOpProtoAndCheckerMaker + : (std::is_base_of::value + ? kGradOpDescMaker + : (std::is_base_of::value + ? kVarTypeInference + : (std::is_base_of::value + ? kShapeInference + : static_cast( + -1))))); + } +}; + +template ::ID()> +struct OpInfoFiller; + +template +class OperatorRegistrarRecursive; + +template +class OperatorRegistrarRecursive { + public: + using T = typename std::tuple_element>::type; + OperatorRegistrarRecursive(const char* op_type, OpInfo* info) { + OpInfoFiller fill; + fill(op_type, info); + constexpr auto size = sizeof...(ARGS); + OperatorRegistrarRecursive reg(op_type, + info); + (void)(reg); + } +}; + +template +class OperatorRegistrarRecursive { + public: + OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {} +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->creator_ = [](const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) { + return new T(type, inputs, outputs, attrs); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->proto_ = new proto::OpProto; + info->checker_ = new OpAttrChecker(); + auto maker = T(info->proto_, info->checker_); + maker.Validate(); + info->proto_->set_type(op_type); + PADDLE_ENFORCE( + info->proto_->IsInitialized(), + "Fail to initialize %s's OpProto, because %s is not initialized", + op_type, info->proto_->InitializationErrorString()); + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->grad_op_maker_ = []( + const OpDesc& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block) { + T maker(fwd_op, no_grad_set, grad_to_var, grad_block); + return maker(); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + 
info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { + T inference; + inference(fwd_op, block); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_shape_ = [](InferShapeContext* ctx) { + T inference; + inference(ctx); + }; + } +}; + +} // namespace details + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/unbuffered_channel.h b/paddle/fluid/framework/details/unbuffered_channel.h new file mode 100644 index 0000000000000000000000000000000000000000..5c9424928cb7029aac813e7b2f29f81a0093f836 --- /dev/null +++ b/paddle/fluid/framework/details/unbuffered_channel.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/channel.h" + +namespace paddle { +namespace framework { +namespace details { + +// Four of the properties of UnBuffered Channel: +// - A send to a channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from a channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. 
+template +class UnBuffered : public paddle::framework::Channel { + friend Channel* paddle::framework::MakeChannel(size_t); + friend void paddle::framework::CloseChannel(Channel*); + + public: + virtual bool Send(T*); + virtual bool Receive(T*); + virtual size_t Cap() { return 0; } + virtual void Close(); + virtual ~UnBuffered(); + + private: + std::mutex mu_ch_; + // Mutex for readers and writers who are waiting for other reader + // and writer to complete execution + std::recursive_mutex mu_read_, mu_write_; + // reader_found_ is set true when a reader is ready to accept data + // writer_found_ is set true when a writer is ready to send data + // A transaction occurs only when both are true + std::atomic reader_found_{false}, writer_found_{false}; + std::condition_variable cv_channel_; + std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_; + T* item{nullptr}; + std::atomic closed_{false}; + std::atomic send_ctr{0}; + std::atomic recv_ctr{0}; + + UnBuffered() : closed_(false) {} + + void NotifyAllParticipants(std::unique_lock*); +}; + +// This function implements the concept of how data should +// be sent from a writer to a reader. 
+template +bool UnBuffered::Send(T* data) { + bool ret = false; + if (closed_) { + return ret; + } + send_ctr++; + // Prevent other writers from entering + std::unique_lock writer_lock(mu_write_); + writer_found_ = true; + std::unique_lock cv_lock(mu_write_); + // If writer comes first, it should wait till a reader arrives + cv_writer_.wait(cv_lock, + [this]() { return reader_found_ == true || closed_; }); + cv_reader_.notify_one(); + if (!closed_) { + std::unique_lock channel_lock(mu_ch_); + item = data; + channel_lock.unlock(); + cv_channel_.notify_one(); + channel_lock.lock(); + cv_channel_.wait(channel_lock, + [this]() { return item == nullptr || closed_; }); + ret = true; + } + writer_found_ = false; + send_ctr--; + cv_destructor_.notify_one(); + return ret; +} + +// This function implements the concept of how +// data that was sent by a writer is read from a reader. +template +bool UnBuffered::Receive(T* data) { + bool ret = false; + // If channel is closed, we don't even want any reader to enter. + // Unlike a buffered channel, an unbuffered channel does not allow + // readers to read after closing because there is no buffer to be consumed. 
+ if (closed_) return ret; + recv_ctr++; + // Prevent other readers from entering + std::unique_lock read_lock{mu_read_}; + reader_found_ = true; + std::unique_lock cv_lock{mu_read_}; + // If reader comes first, it should wait till a writer arrives + cv_reader_.wait(cv_lock, + [this]() { return writer_found_ == true || closed_; }); + cv_writer_.notify_one(); + if (!closed_) { + std::unique_lock lock_ch{mu_ch_}; + // Reader should wait for the writer to first write its data + cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; }); + if (!closed_) { + *data = std::move(*item); + item = nullptr; + lock_ch.unlock(); + ret = true; + } + cv_channel_.notify_one(); + } + reader_found_ = false; + recv_ctr--; + cv_destructor_.notify_one(); + return ret; +} + +// This function implements the sequence of events +// that take place once the channel is closed. +template +void UnBuffered::Close() { + if (closed_) { + return; + } + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); +} + +// This function implements the sequence of events +// that are executed once the object of an UnBuffered +// channel is destroyed. +template +UnBuffered::~UnBuffered() { + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); + lock.lock(); + cv_destructor_.wait(lock, + [this]() { return send_ctr == 0 && recv_ctr == 0; }); +} + +// This function notifies all the readers, writers and +// the channel condition variables. 
+template +void UnBuffered::NotifyAllParticipants(std::unique_lock* lock) { + lock->unlock(); + cv_writer_.notify_all(); + cv_channel_.notify_all(); + cv_reader_.notify_all(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h new file mode 100644 index 0000000000000000000000000000000000000000..3938fd3df5b54443fcbaebf600840ccf2337a173 --- /dev/null +++ b/paddle/fluid/framework/dim.h @@ -0,0 +1,421 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +// Statically sized, statically indexed dimension +template +struct Dim { + static constexpr int dimensions = i; + + template + HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { + static_assert(sizeof...(_tail) == i - 1, + "Dim initialized with the wrong number of parameters"); + } + + HOSTDEVICE + Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} + + HOSTDEVICE + Dim() : head(0), tail() {} + + /** Construct a Dim from a linear index and size. Uses Fortran order + * indexing. 
*/ + HOSTDEVICE + Dim(int64_t idx, const Dim& size) + : head(idx % size.head), tail(idx / size.head, size.tail) {} + + /** Construct a Dim with each dimension set to the given index */ + HOSTDEVICE + Dim(int64_t idx) : head(idx), tail(idx) {} + + HOSTDEVICE + bool operator==(const Dim& o) const { + return (head == o.head) && (tail == o.tail); + } + + HOSTDEVICE + bool operator!=(const Dim& o) const { return !(*this == o); } + + HOSTDEVICE + int64_t& operator[](int idx); + HOSTDEVICE + int64_t operator[](int idx) const; + + HOST std::string to_string() const; + + int64_t head; + Dim tail; +}; + +// Base case specialization +template <> +struct Dim<1> { + static constexpr int dimensions = 1; + + HOSTDEVICE + Dim(int64_t _head) : head(_head) {} + + HOSTDEVICE + Dim() : head(0) {} + + HOSTDEVICE + Dim(int idx, const Dim<1>& size) : head(idx) { +#ifndef __CUDA_ARCH__ + if (idx >= size.head) { + throw std::invalid_argument("Index out of range."); + } +#else + PADDLE_ASSERT(idx < size.head); +#endif + } + + HOSTDEVICE + bool operator==(const Dim<1>& o) const { return (head == o.head); } + + HOSTDEVICE + bool operator!=(const Dim<1>& o) const { return !(*this == o); } + + HOSTDEVICE + int64_t& operator[](int idx); + HOSTDEVICE + int64_t operator[](int idx) const; + + int64_t head; +}; + +namespace { + +// Helper for accessing Dim classes +template +struct DimGetter { + // Return a copy if Dim is const + template + HOSTDEVICE static int64_t impl(const D& d) { + return DimGetter::impl(d.tail); + } + // Return a reference if Dim is mutable + template + HOSTDEVICE static int64_t& impl(D& d) { + return DimGetter::impl(d.tail); + } +}; + +// Eureka! We found the element! 
+template <> +struct DimGetter<0> { + // Return a copy if Dim is const + template + HOSTDEVICE static int64_t impl(const D& d) { + return d.head; + } + // Return a reference if Dim is mutable + template + HOSTDEVICE static int64_t& impl(D& d) { + return d.head; + } +}; + +template +HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx < 0) { + throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + PADDLE_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx != 0) { + throw std::invalid_argument("Invalid index"); + } +#else + PADDLE_ASSERT(idx == 0); +#endif + return dim.head; +} + +template +HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx < 0) { + throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + PADDLE_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx != 0) { + throw std::invalid_argument("Invalid index"); + } +#else + PADDLE_ASSERT(idx == 0); +#endif + return dim.head; +} + +} // namespace +// Static access to constant Dim +template +HOSTDEVICE int64_t get(const Dim& d) { + return DimGetter::impl(d); +} + +// Static access to mutable Dim +template +HOSTDEVICE int64_t& get(Dim& d) { + return DimGetter::impl(d); +} + +// Dynamic access to constant Dim +template +HOSTDEVICE int64_t Dim::operator[](int i) const { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim +template +HOSTDEVICE int64_t& Dim::operator[](int i) { + return indexer(*this, i); +} + +// Dynamic access to constant Dim +inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim 
+inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) { + return indexer(*this, i); +} + +// Dynamic access to constant Dim +// without std::enable_if will try to instantiate this on get<0>(d) +template +HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, + int i) { + return d[i]; +} + +// Dynamic access to mutable Dim +template +HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, + int i) { + return d[i]; +} + +// Dot product of two dims +template +HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { + return a.head * b.head + linearize(a.tail, b.tail); +} + +// Base case dot product of two Dims +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) { + return a.head * b.head; +} + +// Product of a Dim +template +HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { + return prod * a.head * product(a.tail); +} + +// Base case product of a Dim +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) { + return prod * a.head; +} + +// Is 0 <= idx_i < size_i for all i? +template +HOSTDEVICE bool contained(const Dim& idx, const Dim& size) { + return ((0 <= idx.head) && (idx.head < size.head) && + contained(idx.tail, size.tail)); +} + +// Base case of is 0 <= idx_i < size_i ? +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) { + return ((0 <= idx.head) && (idx.head < size.head)); +} + +/** + * \brief Compute exclusive prefix-multiply of a Dim. 
+ */ +template +HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) { + return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); +} + +///\cond HIDDEN +// Base case of ex_prefix_mul +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) { + return Dim<1>(mul); +} +///\endcond + +/** + * Add two dimensions together + */ +template +HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) { + return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); +} + +// Base case +template <> +HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) { + return Dim<1>(a.head + b.head); +} + +template +HOSTDEVICE Dim operator+(const Dim& lhs, const Dim& rhs) { + return dim_plus(lhs, rhs); +} + +/** + * Multiply two dimensions together + */ +template +HOSTDEVICE Dim dim_mult(const Dim& a, const Dim& b) { + return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); +} + +// Base case +template <> +HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) { + return Dim<1>(a.head * b.head); +} + +template +HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { + return dim_mult(lhs, rhs); +} + +/** + * \brief Normalize strides to ensure any dimension with extent 1 + * has stride 0. + * + * \param size Dim object containing the size of an array + * \param stride Dim object containing stride of an array + * \return Dim object the same size as \p size with normalized strides + * + */ + +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + int norm_stride = size.head == 1 ? 0 : stride.head; + return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); +} + +///\cond HIDDEN + +template <> +HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size, + const Dim<1>& stride) { + int norm_stride = size.head == 1 ? 
0 : stride.head; + return Dim<1>(norm_stride); +} + +///\endcond + +/** + * Helper function to create a Dim + * + * \param idxes The type of Dim constructed depends on the number of params + * + */ + +template +HOSTDEVICE Dim make_dim(Args... idxes) { + return Dim(idxes...); +} + +// Allows us to output a Dim +// XXX For some reason, overloading fails to resolve this correctly +template +typename std::enable_if<(i > 1), std::ostream&>::type operator<<( + std::ostream& os, const Dim& d) { + os << d.head << ", " << d.tail; + return os; +} + +// Base case that allows us to output a Dim +// XXX I wish this could be an overload instead of a template +template +typename std::enable_if<(i == 1), std::ostream&>::type operator<<( + std::ostream& os, const Dim& d) { + os << d.head; + return os; +} + +template +HOST std::string Dim::to_string() const { + std::stringstream stream; + + stream << *this; + + return stream.str(); +} + +template +HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { + Dim result; + + for (int i = 0; i < D - 1; ++i) { + result[i] = linear_index % extents[i]; + linear_index /= extents[i]; + } + + result[D - 1] = linear_index; + + return result; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/fluid/framework/dim_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..0f1969d79775ba70661806d589ae3de2696b77e8 --- /dev/null +++ b/paddle/fluid/framework/dim_test.cu @@ -0,0 +1,114 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/dim.h" + +__global__ void test(paddle::framework::Dim<2>* o) { + o[0] = paddle::framework::make_dim(5, 6); +} + +__global__ void dyn_idx_gpu(int64_t* o) { + auto d = paddle::framework::make_dim(5, 6); + o[0] = d[1]; +} + +TEST(Dim, Equality) { + // construct a Dim on the CPU + auto a = paddle::framework::make_dim(3, 4); + EXPECT_EQ(paddle::framework::get<0>(a), 3); + EXPECT_EQ(paddle::framework::get<1>(a), 4); + + // construct a Dim on the GPU + thrust::device_vector> t(2); + test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(paddle::framework::get<0>(a), 5); + EXPECT_EQ(paddle::framework::get<1>(a), 6); + + // linearization + auto b = paddle::framework::make_dim(7, 8); + EXPECT_EQ(paddle::framework::linearize(a, b), 83); + + // product + EXPECT_EQ(paddle::framework::product(a), 30); + + // mutate a Dim + paddle::framework::get<1>(b) = 10; + EXPECT_EQ(paddle::framework::get<0>(b), 7); + EXPECT_EQ(paddle::framework::get<1>(b), 10); + + // dynamic access + paddle::framework::get(b, 0) = 8; + b[1] = 11; + EXPECT_EQ(paddle::framework::get<0>(b), 8); + EXPECT_EQ(paddle::framework::get<1>(b), 11); + EXPECT_EQ(paddle::framework::get(b, 0), 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector r(1); + dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + int64_t res = r[0]; + EXPECT_EQ(res, 6); + + // ex_prefix_mul + paddle::framework::Dim<3> c = + paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); + 
EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 12); + + // generate from an index + auto size = paddle::framework::make_dim(4, 5, 2); + c = paddle::framework::Dim<3>(14, size); + EXPECT_EQ(paddle::framework::get<0>(c), 2); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 0); + c = paddle::framework::Dim<3>(25, size); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 1); + EXPECT_EQ(paddle::framework::get<2>(c), 1); +} + +TEST(Dim, Bool) { + auto a = paddle::framework::make_dim(3, 4); + auto b = paddle::framework::make_dim(5, 6); + auto c = paddle::framework::make_dim(3, 4); + + // in_bounds check + EXPECT_TRUE(paddle::framework::contained(a, b)); + EXPECT_FALSE(paddle::framework::contained(b, a)); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); +} + +TEST(Dim, Print) { + { + std::stringstream ss; + auto a = paddle::framework::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << paddle::framework::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } +} diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h new file mode 100644 index 0000000000000000000000000000000000000000..d1b8c701a7941813c0fbf441b8a6c7f4d3811a6d --- /dev/null +++ b/paddle/fluid/framework/eigen.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace framework { + +// EigenDim converts paddle::platform::DDim into Eigen::DSizes. +template +struct EigenDim { + using Type = Eigen::DSizes; + + static Type From(const DDim& dims) { + PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)"); + Type ret; + for (int64_t d = 0; d < arity(dims); d++) { + ret[d] = dims[d]; + } + return ret; + } +}; + +// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +template +struct EigenTensor { + // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on + // the speed of aligned and unaligned version in future. + using Type = Eigen::TensorMap>; + + using ConstType = + Eigen::TensorMap>; + + static Type From(Tensor& tensor, DDim dims) { + return Type(tensor.data(), EigenDim::From(dims)); + } + + static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); } + + static ConstType From(const Tensor& tensor, DDim dims) { + return ConstType(tensor.data(), EigenDim::From(dims)); + } + + static ConstType From(const Tensor& tensor) { + return From(tensor, tensor.dims_); + } +}; + +template +struct EigenMatrix : public EigenTensor { + static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } + + static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, + int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), 
num_col_dims)); + } +}; + +template +struct EigenVector : public EigenTensor { + // Flatten reshapes a Tensor into an EigenVector. + static typename EigenVector::Type Flatten(Tensor& tensor) { + return EigenVector::From(tensor, {product(tensor.dims_)}); + } + + static typename EigenVector::ConstType Flatten(const Tensor& tensor) { + return EigenVector::From(tensor, {product(tensor.dims_)}); + } +}; + +template +struct EigenScalar { + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + using Type = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + using ConstType = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + + static Type From(Tensor& tensor) { return Type(tensor.data()); } + + static ConstType From(const Tensor& tensor) { + return ConstType(tensor.data()); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9e3abeccb34fd1778cac6918d24e30021b433e9 --- /dev/null +++ b/paddle/fluid/framework/eigen_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/eigen.h" +#include + +namespace paddle { +namespace framework { + +TEST(EigenDim, From) { + EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3})); + ASSERT_EQ(1, ed[0]); + ASSERT_EQ(2, ed[1]); + ASSERT_EQ(3, ed[2]); +} + +TEST(Eigen, Tensor) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenTensor::Type et = EigenTensor::From(t); + + ASSERT_EQ(1, et.dimension(0)); + ASSERT_EQ(2, et.dimension(1)); + ASSERT_EQ(3, et.dimension(2)); + + for (int i = 0; i < 1; i++) { + for (int j = 0; j < 2; j++) { + for (int k = 0; k < 3; k++) { + ASSERT_NEAR((i * 2 + j) * 3 + k, et(i, j, k), 1e-6f); + } + } + } +} + +TEST(Eigen, ScalarFrom) { + Tensor t; + int* p = t.mutable_data(make_ddim({1}), platform::CPUPlace()); + *p = static_cast(100); + + EigenScalar::Type es = EigenScalar::From(t); + + ASSERT_EQ(0, es.dimension(0)); + ASSERT_EQ(100, es(0)); +} + +TEST(Eigen, VectorFrom) { + Tensor t; + float* p = t.mutable_data(make_ddim({6}), platform::CPUPlace()); + for (int i = 0; i < 6; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::From(t); + + ASSERT_EQ(6, ev.dimension(0)); + + for (int i = 0; i < 6; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, VectorFlatten) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::Flatten(t); + + ASSERT_EQ(1 * 2 * 3, ev.dimension(0)); + + for (int i = 0; i < 1 * 2 * 3; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, Matrix) { + Tensor t; + float* p = t.mutable_data(make_ddim({2, 3}), platform::CPUPlace()); + for (int i = 0; i < 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::From(t); + + ASSERT_EQ(2, em.dimension(0)); + ASSERT_EQ(3, em.dimension(1)); + + for (int i = 
0; i < 2; i++) { + for (int j = 0; j < 3; j++) { + ASSERT_NEAR(i * 3 + j, em(i, j), 1e-6f); + } + } +} + +TEST(Eigen, MatrixReshape) { + Tensor t; + float* p = t.mutable_data({2, 3, 6, 4}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 6 * 4; ++i) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::Reshape(t, 2); + + ASSERT_EQ(2 * 3, em.dimension(0)); + ASSERT_EQ(6 * 4, em.dimension(1)); + + for (int i = 0; i < 2 * 3; i++) { + for (int j = 0; j < 6 * 4; j++) { + ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..816ad8d6590a1af3a043cfef5da1edee5119575d --- /dev/null +++ b/paddle/fluid/framework/executor.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/executor.h" + +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(benchmark); +DEFINE_bool(check_nan_inf, false, + "Checking whether operator produce NAN/INF or not. It will be " + "extremely slow so please use this flag wisely."); + +namespace paddle { +namespace framework { + +Executor::Executor(const platform::Place& place) : place_(place) {} + +static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { + if (var_type == proto::VarDesc::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::READER) { + var->GetMutable(); + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER]", + var_type); + } +} + +static void CheckTensorNANOrInf(const std::string& name, + const framework::Tensor& tensor) { + if (tensor.memory_size() == 0) { + return; + } + if (tensor.type().hash_code() != typeid(float).hash_code() && + tensor.type().hash_code() != 
typeid(double).hash_code()) { + return; + } + PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); + PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name); +} + +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, + bool create_local_scope, bool create_vars) { + // TODO(tonyyang-svail): + // - only runs on the first device (i.e. no interdevice communication) + // - will change to use multiple blocks for RNN op and Cond Op + PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); + auto& block = pdesc.Block(block_id); + + Scope* local_scope = scope; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : block.AllVars()) { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } // if (create_local_scope) + } // if (create_vars) + + for (auto& op_desc : block.AllOps()) { + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + VLOG(4) << op->DebugStringEx(local_scope); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(op->Type(), pool.Get(place_)); + + op->Run(*local_scope, place_); + VLOG(3) << op->DebugStringEx(local_scope); + if (FLAGS_benchmark) { + VLOG(2) << "Memory used after operator " + op->Type() + " running: " + << memory::memory_usage(place_); + } + if (FLAGS_check_nan_inf) { 
+ for (auto& vname : op->OutputVars(true)) { + auto* var = local_scope->FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } + } + } + } + if (create_vars && create_local_scope) { + scope->DeleteScope(local_scope); + } + if (FLAGS_benchmark) { + VLOG(2) << "-------------------------------------------------------"; + VLOG(2) << "Memory used after deleting local scope: " + << memory::memory_usage(place_); + VLOG(2) << "-------------------------------------------------------"; + } +} + +// Check whether the block already has feed operators and feed_holder. +// Return false if the block does not have any feed operators. +// If some feed operators have been prepended to the block, check that +// the info contained in these feed operators matches the feed_targets +// and feed_holder_name. Raise exception when any mismatch is found. +// Return true if the block has feed operators and holder of matching info. +static bool has_feed_operators( + BlockDesc* block, std::map& feed_targets, + const std::string& feed_holder_name) { + size_t feed_count = 0; + for (auto* op : block->AllOps()) { + if (op->Type() == kFeedOpType) { + feed_count++; + PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, + "Input to feed op should be '%s'", feed_holder_name); + std::string feed_target_name = op->Output("Out")[0]; + PADDLE_ENFORCE( + feed_targets.find(feed_target_name) != feed_targets.end(), + "Feed operator output name '%s' cannot be found in 'feed_targets'", + feed_target_name); + } + } + + if (feed_count > 0) { + PADDLE_ENFORCE_EQ( + feed_count, feed_targets.size(), + "The number of feed operators should match 'feed_targets'"); + + // When feed operator are present, so should be feed_holder + auto var = block->FindVar(feed_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + feed_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH, + "'%s' variable 
should be 'FEED_MINIBATCH' type", + feed_holder_name); + } + + return feed_count > 0; +} + +// Check whether the block already has fetch operators and fetch_holder. +// Return false if the block does not have any fetch operators. +// If some fetch operators have been appended to the block, check that +// the info contained in these fetch operators matches the fetch_targets +// and fetch_holder_name. Raise exception when any mismatch is found. +// Return true if the block has fetch operators and holder of matching info. +static bool has_fetch_operators( + BlockDesc* block, std::map& fetch_targets, + const std::string& fetch_holder_name) { + size_t fetch_count = 0; + for (auto* op : block->AllOps()) { + if (op->Type() == kFetchOpType) { + fetch_count++; + PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, + "Output of fetch op should be '%s'", fetch_holder_name); + std::string fetch_target_name = op->Input("X")[0]; + PADDLE_ENFORCE( + fetch_targets.find(fetch_target_name) != fetch_targets.end(), + "Fetch operator input name '%s' cannot be found in 'fetch_targets'", + fetch_target_name); + } + } + + if (fetch_count > 0) { + PADDLE_ENFORCE_EQ( + fetch_count, fetch_targets.size(), + "The number of fetch operators should match 'fetch_targets'"); + + // When fetch operator are present, so should be fetch_holder + auto var = block->FindVar(fetch_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + fetch_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST, + "'%s' variable should be 'FETCH_LIST' type", + fetch_holder_name); + } + + return fetch_count > 0; +} + +void Executor::Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name, + const std::string& fetch_holder_name) { + auto* copy_program = new ProgramDesc(program); + auto* global_block = copy_program->MutableBlock(0); + + if (!has_feed_operators(global_block, 
feed_targets, feed_holder_name)) { + // create feed_holder variable + auto* feed_holder = global_block->Var(feed_holder_name); + feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH); + feed_holder->SetPersistable(true); + + int i = 0; + for (auto& feed_target : feed_targets) { + std::string var_name = feed_target.first; + VLOG(3) << "feed target's name: " << var_name; + + // prepend feed op + auto* op = global_block->PrependOp(); + op->SetType(kFeedOpType); + op->SetInput("X", {feed_holder_name}); + op->SetOutput("Out", {var_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + // map the data of feed_targets to feed_holder + for (auto* op : global_block->AllOps()) { + if (op->Type() == kFeedOpType) { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, + idx); + } + } + + if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) { + // create fetch_holder variable + auto* fetch_holder = global_block->Var(fetch_holder_name); + fetch_holder->SetType(proto::VarDesc::FETCH_LIST); + fetch_holder->SetPersistable(true); + + int i = 0; + for (auto& fetch_target : fetch_targets) { + std::string var_name = fetch_target.first; + VLOG(3) << "fetch target's name: " << var_name; + + // append fetch op + auto* op = global_block->AppendOp(); + op->SetType(kFetchOpType); + op->SetInput("X", {var_name}); + op->SetOutput("Out", {fetch_holder_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + Run(*copy_program, scope, 0, true, true); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : global_block->AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + GetFetchVariable(*scope, fetch_holder_name, idx); + } + } + + delete 
copy_program; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h new file mode 100644 index 0000000000000000000000000000000000000000..893c949939e8db2f5227b940bf721ca7f114db9c --- /dev/null +++ b/paddle/fluid/framework/executor.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +class Executor { + public: + // TODO(dzhwinter) : Do not rely on this function, it will be removed + explicit Executor(const platform::DeviceContext& device) + : Executor(device.GetPlace()) {} + + explicit Executor(const platform::Place& place); + + /* @Brief + * Runtime evaluation of the given ProgramDesc under certain Scope + * + * @param + * ProgramDesc + * Scope + */ + void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, + bool create_vars = true); + + void Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + + private: + const platform::Place place_; +}; + +} // namespace framework +} 
// namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9bb17355d9bc1fb7f355a4038a4c5831d3530b1 --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "glog/logging.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. + VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; + Variable* g_feed_value = scope->Var(var_name); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); + if (index >= feed_inputs.size()) { + feed_inputs.resize(index + 1); + } + // shared data with input tensor + feed_inputs[index].ShareDataWith(input); + // set lod + feed_inputs[index].set_lod(input.lod()); +} + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index) { + // Since we want to fetch LodTensor from a variable, the variable must + // be created alreadly. 
+ Variable* g_fetch_value = scope.FindVar(var_name); + PADDLE_ENFORCE(g_fetch_value->IsType(), + "Only %s can be invoked by GetFetchVariable", + typeid(FeedFetchList).name()); + auto& fetch_outputs = *g_fetch_value->GetMutable(); + auto& tensor = fetch_outputs[index]; + VLOG(3) << "Fetch " << var_name << " with index " << index + << " shape= " << tensor.dims(); + PADDLE_ENFORCE_LT(index, fetch_outputs.size()); + return tensor; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h new file mode 100644 index 0000000000000000000000000000000000000000..5355c29047e668d1a2ec141b303dd562158b2bb3 --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index); + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h new file mode 100644 index 0000000000000000000000000000000000000000..4281e36b138f66268e2f1e835d4475676d97839d --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +using FeedFetchType = LoDTensor; +using FeedFetchList = std::vector; + +static const std::string kFeedOpType = "feed"; +static const std::string kFetchOpType = "fetch"; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto new file mode 100644 index 0000000000000000000000000000000000000000..d7be1a7352da56e411396614e33919bb55bc3b0f --- /dev/null +++ b/paddle/fluid/framework/framework.proto @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +option optimize_for = LITE_RUNTIME; +package paddle.framework.proto; + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; + BOOLEAN = 6; + BOOLEANS = 7; + BLOCK = 8; + LONG = 9; +} + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. +message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + optional bool b = 10; + repeated bool bools = 11; + optional int32 block_idx = 12; + optional int64 l = 13; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. 
+ message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. + optional bool generated = 4 [ default = false ]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} + +enum DataType { + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; +} + +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} + +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; +} + +message LoDTensorArrayDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; +} + +message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } + +message VarDesc { + enum VarType { + LOD_TENSOR = 1; + SELECTED_ROWS = 2; + FEED_MINIBATCH = 3; + FETCH_LIST = 4; + STEP_SCOPES = 5; + LOD_RANK_TABLE = 6; + LOD_TENSOR_ARRAY = 7; + PLACE_LIST = 8; + READER = 9; + } + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; + optional LoDTensorDesc lod_tensor = 4; + optional TensorDesc selected_rows = 5; + optional LoDTensorArrayDesc tensor_array = 6; + optional ReaderDesc reader = 7; +} + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; +} + +// Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md +// for more details. 
+message ProgramDesc { repeated BlockDesc blocks = 1; } diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h new file mode 100644 index 0000000000000000000000000000000000000000..21dd4e885485f88eeeb034ca45c643f9fadf3163 --- /dev/null +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +/* + This functor class is responsible for creating the gradient ops for the given + operator fwd_op. After it is called (through operator()), the pairs of + (gradient variable, corresponding input variable of fwd_op) will be added to + grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its + gradient varialbe will be ignored or kEmptyVarName depending on the template + argument DropEmptyIG in the derived classes. 
+ */ +class GradOpDescMakerBase { + public: + explicit GradOpDescMakerBase( + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block = std::vector()) + : fwd_op_(fwd_op), + no_grad_set_(no_grad_set), + grad_to_var_(grad_to_var), + grad_block_(grad_block) {} + + virtual ~GradOpDescMakerBase() = default; + virtual std::vector> operator()() const = 0; + + protected: + std::vector InputGrad(const std::string& name, + bool drop_empty_grad = true) const { + std::vector ret_val; + auto var_names = this->Input(name); + ret_val.reserve(var_names.size()); + std::transform(var_names.begin(), var_names.end(), + std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + if (no_grad_set_.count(g_name)) { + return kEmptyVarName; + } else { + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + } + }); + if (!drop_empty_grad) { + return ret_val; + } + PADDLE_ENFORCE_LE(var_names.size(), 1UL, + "BUG from operator developer:" + " for input argument with a list of variables, " + " drop_empty_grad is not allowed because it makes" + " the correspondence bewteen a variable and its gradient" + " ambiguous. Use REGISTER_OP_EX to register the op" + " or call InputGrad(?,false) in GradOpDescMaker." 
+ " Op type %s", + fwd_op_.Type()); + + std::vector dropped_ret_val; + dropped_ret_val.reserve(ret_val.size()); + std::copy_if(ret_val.begin(), ret_val.end(), + std::back_inserter(dropped_ret_val), + [](const std::string& str) { return str != kEmptyVarName; }); + return dropped_ret_val; + } + + std::vector OutputGrad(const std::string& name) const { + std::vector ret_val; + auto onames = this->Output(name); + ret_val.reserve(onames.size()); + std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + }); + return ret_val; + } + + std::vector InputNames() const { + return this->fwd_op_.InputNames(); + } + + std::vector OutputNames() const { + return this->fwd_op_.OutputNames(); + } + + std::vector Input(const std::string& name) const { + return fwd_op_.Input(name); + } + + std::vector Output(const std::string& name) const { + return fwd_op_.Output(name); + } + + const std::unordered_map& Attrs() const { + return fwd_op_.GetAttrMap(); + } + + const Attribute& GetAttr(const std::string& name) const { + auto& map = fwd_op_.GetAttrMap(); + auto it = map.find(name); + PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name); + return it->second; + } + + template + inline const T& Attr(const std::string& name) const { + return boost::get(GetAttr(name)); + } + + std::string ForwardOpType() const { return this->fwd_op_.Type(); } + + private: + const OpDesc& fwd_op_; + const std::unordered_set& no_grad_set_; + std::unordered_map* grad_to_var_; + + protected: + std::vector grad_block_; +}; + +class SingleGradOpDescMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const { + std::vector> retv; + retv.emplace_back(this->Apply()); + return retv; + } + + protected: + virtual std::unique_ptr Apply() const = 0; +}; + 
+template +class DefaultGradOpDescMaker : public SingleGradOpDescMaker { + public: + using SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto* grad = new OpDesc(); + grad->SetType(this->GradOpType()); + + for (auto& input_param : this->InputNames()) { + grad->SetInput(input_param, this->Input(input_param)); + grad->SetOutput(GradVarName(input_param), + this->InputGrad(input_param, DropEmptyIG)); + } + + for (auto& output_param : this->OutputNames()) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param)); + } + + grad->SetAttrMap(this->Attrs()); + + return std::unique_ptr(grad); + } + + virtual std::string GradOpType() const { + return this->ForwardOpType() + "_grad"; + } +}; + +class EmptyGradOpMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + std::vector> operator()() const override { + return {}; + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb2d740d8609210064a06cccc5d45c84275e9709 --- /dev/null +++ b/paddle/fluid/framework/init.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include // for strdup +#include +#include +#include + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/string/piece.h" + +namespace paddle { +namespace framework { + +std::once_flag gflags_init_flag; + +void InitGflags(std::vector &argv) { + std::call_once(gflags_init_flag, [&]() { + int argc = argv.size(); + char **arr = new char *[argv.size()]; + std::string line; + for (size_t i = 0; i < argv.size(); i++) { + arr[i] = &argv[i][0]; + line += argv[i]; + line += ' '; + } + google::ParseCommandLineFlags(&argc, &arr, true); + VLOG(1) << "Init commandline: " << line; + }); +} + +void InitDevices() { + /*Init all avaiable devices by default */ + + std::vector places; + places.emplace_back(platform::CPUPlace()); + int count = 0; + +#ifdef PADDLE_WITH_CUDA + try { + count = platform::GetCUDADeviceCount(); + } catch (const std::exception &exp) { + LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; + } +#else + LOG(WARNING) + << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; +#endif + + for (int i = 0; i < count; ++i) { + places.emplace_back(platform::CUDAPlace(i)); + } + + platform::DeviceContextPool::Init(places); +} + +void InitGLOG(const std::string &prog_name) { + // glog will not hold the ARGV[0] inside. + // Use strdup to alloc a new string. 
+ google::InitGoogleLogging(strdup(prog_name.c_str())); + google::InstallFailureSignalHandler(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/init.h b/paddle/fluid/framework/init.h similarity index 100% rename from paddle/framework/init.h rename to paddle/fluid/framework/init.h diff --git a/paddle/fluid/framework/init_test.cc b/paddle/fluid/framework/init_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3018541e27a403e1b6e63a8da9eeb1f67915e9e --- /dev/null +++ b/paddle/fluid/framework/init_test.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(InitDevices, CPU) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifndef PADDLE_WITH_CUDA + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U); +#endif +} + +TEST(InitDevices, CUDA) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifdef PADDLE_WITH_CUDA + int count = paddle::platform::GetCUDADeviceCount(); + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U + static_cast(count)); +#endif +} diff --git a/paddle/framework/library_type.h b/paddle/fluid/framework/library_type.h similarity index 100% rename from paddle/framework/library_type.h rename to paddle/fluid/framework/library_type.h diff --git a/paddle/fluid/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc new file mode 100644 index 0000000000000000000000000000000000000000..31c87492349bff4cd81b101a0e8d44b0516bac46 --- /dev/null +++ b/paddle/fluid/framework/lod_rank_table.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/lod_rank_table.h" + +namespace paddle { +namespace framework { +void LoDRankTable::Reset(const LoD& lod, size_t level) { + this->coarse_lod_.clear(); + this->items_.clear(); + PADDLE_ENFORCE(level < lod.size(), + "Cannot rank lod since the level %d is less than lod size %d", + level, lod.size()); + coarse_lod_.reserve(level); + for (size_t i = 0; i < level; ++i) { + coarse_lod_.push_back(lod[i]); + } + auto& vec = lod[level]; + for (size_t i = 0; i < vec.size() - 1; ++i) { + TableItem item; + item.index = i; + item.length = vec[i + 1] - vec[i]; + VLOG(10) << "Add item to rank table " << item.index << " " << item.length; + items_.emplace_back(item); + } + // NOTE(yuyang18): + // + // The time complexity of stable_sort is O(N*log(N)) if additional memory is + // available. It is easy to debug and unit test when using `stable_sort` + // instead of `sort`. Also, the items of a rank table will not be too large. + std::stable_sort(items_.begin(), items_.end(), + [](const TableItem& a, const TableItem& b) { + return a.length > b.length; + }); +} + +} // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table) { + out << "NumOfSequence " << table.items().size() << "\n"; + for (auto& each_item : table.items()) { + out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n"; + } + return out; +} +} // namespace paddle diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h new file mode 100644 index 0000000000000000000000000000000000000000..0eaaf49e4c4d90250b247edf2a8699b8c7c5920d --- /dev/null +++ b/paddle/fluid/framework/lod_rank_table.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { + +// LoD Rank Table stores the `level` of `lod` which is ordered by sequence +// length in descending order. It is useful when implement dynamic RNN and is +// shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice +// output operators. +// +// The table item contains two element. The length of sequence and the index of +// sequence in that level. +// +// LoDRankTable also stores the coarse_lod, which is the lod information whose +// level is less than input level, in order to restore the output LoD +// information. +class LoDRankTable { + public: + struct TableItem { + size_t index; + size_t length; + }; + + LoDRankTable() {} + + void Reset(const LoD& lod, size_t level); + + const std::vector& items() const { return this->items_; } + + const LoD& coarse_lod() const { return this->coarse_lod_; } + + size_t level() const { return coarse_lod_.size(); } + + private: + LoD coarse_lod_; + std::vector items_; +}; + +} // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table); + +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..05c67e453d0b8c84aaa8b72ec314153791c73e8f --- /dev/null +++ b/paddle/fluid/framework/lod_tensor.cc @@ -0,0 +1,378 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" + +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +std::ostream &operator<<(std::ostream &os, const LoD &lod) { + os << "{"; + for (auto &v : lod) { + os << "{"; + for (auto &i : v) { + os << i << ","; + } + os << "}"; + } + os << "}"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { + PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); + + if (!platform::is_cpu_place(t.place())) { + LoDTensor tt; + framework::Copy(t, platform::CPUPlace(), &tt); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(t.place()); + dev_ctx.Wait(); + + os << tt; + return os; + } + + os << "dim: " << t.dims() << "\n"; + os << "lod: " << t.lod() << "\n"; + + // only print first ten elements + int64_t size = t.numel() < 10 ? 
t.numel() : 10; + for (int64_t i = 0; i < size; ++i) { + os << t.data()[i] << " "; + } + + return os; +} + +std::string LoDToString(const LoD &lod) { + std::ostringstream stream; + stream << lod; + return stream.str(); +} + +LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, + size_t elem_end) { + PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_end, in[level].size()); + + LoD res; + res.resize(in.size() - level); + // copy the first level + res[0].assign(in[level].begin() + elem_begin, + in[level].begin() + elem_end + 1); + for (size_t lvl = 1; lvl < res.size(); lvl++) { + const auto &in_level = in[level + lvl]; + const auto &above_level = res[lvl - 1]; + auto &out_level = res[lvl]; + out_level.assign(in_level.begin() + above_level.front(), + in_level.begin() + above_level.back() + 1); + } + for (size_t lvl = 0; lvl < res.size(); lvl++) { + // to make the first offset equals 0, all the elements minus the first + // element + size_t front = res[lvl].front(); + for (auto &ele : res[lvl]) { + ele -= front; + } + } + return res; +} + +LoD ToAbsOffset(const LoD &in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; + } + } + return result; +} + +bool operator==(const LoD &a, const LoD &b) { + if (a.size() != b.size()) { + return false; + } + + for (size_t i = 0; i < a.size(); i++) { + const auto &a_level = a[i]; + const auto &b_level = b[i]; + if (a_level.size() != b_level.size()) { + return false; + } + for (size_t j = 0; j < a_level.size(); j++) { + if (a_level[j] != b_level[j]) { + return false; + } + } + } + return true; +} + +bool CheckLoD(const LoD &in, int tensor_height) { + if (in.empty()) return true; + for (const auto &level : in) { + // check: there should be more than 2 
offsets existing in each level. + if (level.size() < 2) return false; + // check: the first offset(the begin offset) of each level should be 0. + if (level.front() != 0) return false; + // check: all the offsets in a level should be ascending(no same items + // allows). + if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { + if (a < b) return true; + return false; + })) { + LOG(INFO) << "ascending error"; + return false; + } + } + // check: the lowest level's last offset should equals `tensor_height` if + // tensor_height>0. + if (tensor_height > 0 && (size_t)tensor_height != in.back().back()) + return false; + + // check: the higher level's last offset should equals the lower level's + // size-1. + // NOTE LoD store the levels from top to bottom, so the higher level goes + // first. + for (size_t level = 0; level < in.size() - 1; level++) { + if (in[level].back() != in[level + 1].size() - 1) return false; + } + return true; +} + +bool CheckAbsLoD(const LoD &in, int tensor_height) { + if (in.empty()) return true; + for (const auto &level : in) { + // check: all the offsets in a level should be ascending(no same items + // allows). + if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { + if (a < b) return true; + return false; + })) { + return false; + } + + // check: there should be more than 2 offsets existing in each level. + if (level.size() < 2) return false; + + // check: the first offset of each level should be 0, and the last should be + // the same(the height of underlying tensor). 
+ if (level.front() != 0) return false; + if (tensor_height < 0) { + tensor_height = level.back(); + } else if ((size_t)tensor_height != level.back()) { + return false; + } + } + return true; +} + +using LoDAndOffset = std::pair>; +LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, + size_t end_idx, size_t start_level) { + LoD sub_lod; + + for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) { + PADDLE_ENFORCE_LE(start_idx, end_idx); + PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size()); + std::vector level_lens; + for (size_t i = start_idx; i < end_idx; ++i) { + level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]); + } + sub_lod.emplace_back(level_lens); + start_idx = lod[level_idx][start_idx]; + end_idx = lod[level_idx][end_idx]; + } + + return LoDAndOffset{sub_lod, {start_idx, end_idx}}; +} + +void AppendLoD(LoD *lod, const LoD &lod_length) { + PADDLE_ENFORCE( + lod->empty() || lod->size() == lod_length.size(), + "The lod_length should has the same size with the appended lod."); + if (lod->empty()) { + for (size_t i = 0; i < lod_length.size(); ++i) { + lod->emplace_back(1, 0); // size = 1, value = 0; + } + *lod = LoD(lod_length.size(), std::vector({0})); + } + for (size_t i = 0; i < lod->size(); ++i) { + auto &level = (*lod)[i]; + for (size_t len : lod_length[i]) { + level.push_back(level.back() + len); + } + } +} + +void SerializeToStream(std::ostream &os, const LoDTensor &tensor, + const platform::DeviceContext &dev_ctx) { + { // the 1st field, uint32_t version for LoDTensor + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2st field, LoD information + // uint64_t lod_level + // uint64_t lod_level_1 size in byte. + // int* lod_level_1 data + // ... 
+ auto lod = tensor.lod(); + uint64_t size = lod.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + + for (auto &each : lod) { + size = each.size() * sizeof(framework::LoD::value_type::value_type); + os.write(reinterpret_cast(&size), sizeof(size)); + os.write(reinterpret_cast(each.data()), + static_cast(size)); + } + } + // the 3st field, Tensor + SerializeToStream(os, static_cast(tensor), dev_ctx); +} + +void DeserializeFromStream(std::istream &is, LoDTensor *tensor, + const platform::DeviceContext &dev_ctx) { + { + // the 1st field, unit32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2st field, LoD information + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } + // the 3st filed, Tensor + DeserializeFromStream(is, static_cast(tensor), dev_ctx); +} + +std::vector LoDTensor::SplitLoDTensor( + const std::vector places) const { + check_memory_size(); + int batch_size = + lod().empty() ? 
dims()[0] : static_cast(lod()[0].size()) - 1; + size_t result_size = std::min(static_cast(batch_size), places.size()); + size_t remainder = batch_size % places.size(); + + std::vector results; + results.reserve(result_size); + + int step_width = static_cast(batch_size / result_size); + for (size_t i = 0; i < result_size; ++i) { + int begin = static_cast(i * step_width); + int end = static_cast((i + 1) * step_width); + if (i + 1 == places.size()) { // last + end += remainder; + } + + LoDTensor dst; + if (lod().empty()) { + auto src = Slice(begin, end); + auto &dst_place = places[i]; + framework::Copy(src, dst_place, &dst); + } else { + auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0); + + auto &offset = lod_and_offset.second; + auto src = Slice(offset.first, offset.second); + auto &dst_place = places[i]; + framework::Copy(src, dst_place, &dst); + + LoD my_lod; + for (auto &l : lod_and_offset.first) { + std::vector v{0}; + for (auto &ll : l) { + v.push_back(ll + v.back()); + } + my_lod.emplace_back(v); + } + dst.set_lod(my_lod); + } + results.emplace_back(dst); + } + + return results; +} + +void LoDTensor::MergeLoDTensor( + const std::vector &lod_tensors, + platform::Place dst_place) { + PADDLE_ENFORCE(!lod_tensors.empty()); + + framework::DDim new_dim = lod_tensors[0]->dims(); + std::type_index new_type = lod_tensors[0]->type(); + framework::DataLayout new_layout = lod_tensors[0]->layout(); + LoD new_lod = lod_tensors[0]->lod(); + for (size_t i = 1; i < lod_tensors.size(); ++i) { + auto *t = lod_tensors[i]; + PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code()); + PADDLE_ENFORCE_EQ(new_layout, t->layout()); + + PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], + framework::product(t->dims()) / t->dims()[0]); + new_dim[0] += t->dims()[0]; + + auto &lod = t->lod(); + for (size_t j = 0; j < lod.size(); ++j) { + auto &sub_lod = new_lod[j]; + auto &offset = sub_lod.back(); + for (size_t k = 1; k < lod[j].size(); ++k) { + 
sub_lod.push_back(lod[j][k] + offset); + } + } + } + Resize(new_dim); + set_layout(new_layout); + set_lod(new_lod); + mutable_data(dst_place, new_type); + + int begin = 0; + for (auto *src : lod_tensors) { + int end = begin + src->dims()[0]; + auto dst = Slice(begin, end); + framework::Copy(*src, dst_place, &dst); + begin = end; + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..1509a9fb1347659f7526c6892f632feb8c84579c --- /dev/null +++ b/paddle/fluid/framework/lod_tensor.h @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +/* + * LoD is short for Level of Details. 
+ * + * - in a level, each element indicates relative offset of the lower level + * - the first element should be 0 and that indicates that this sequence starts + * from 0 + * - each sequence's begin and end (exclusive) is level[id, id+1] + * + * For example: + * 3-level LoD stores + * + * 0 2 3 + * 0 2 4 7 + * 0 2 5 7 10 12 15 20 + */ +using LoD = std::vector>; + +std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const LoDTensor& t); + +std::string LoDToString(const LoD& lod); + +LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, + size_t elem_end); +/* + * Transform an LoD from relative offsets to absolute offsets. + */ +LoD ToAbsOffset(const LoD& in); + +bool operator==(const LoD& a, const LoD& b); + +/* + * Check whether this lod's format is valid. + * + * ATTENTION: + * - Empty lod is treated as valid. + * + * It checks the following: + * + * 1. all the offsets in a level should be ascending (no duplicate items allowed). + * 2. there should be at least 2 offsets existing in each level. + * 3. the higher level's last offset should equal the lower level's size-1. + * 4. the first offset (the begin offset) of each level should be 0. + * 5. the lowest level's last offset should equal `tensor_height` if + * tensor_height>0. + */ + +bool CheckLoD(const LoD& in, int tensor_height = -1); +/* + * Check whether this absolute lod's format is valid. + * + * ATTENTION: + * - Empty lod is treated as valid. + * + * It checks the following: + * 1. all the offsets in a level should be ascending (no duplicate items allowed) + * 2. there should be at least 2 offsets existing in each level. + * 3. the first offset of each level should be 0, and the last should be the + * same (the height of underlying tensor) or `tensor_height` if + * tensor_height>0. 
+ */ +bool CheckAbsLoD(const LoD& in, int tensor_height = -1); + +/* + * LoDTensor (Level of details Tensor) + * see https://en.wikipedia.org/wiki/Level_of_details for reference. + */ +class LoDTensor : public Tensor { + public: + LoDTensor() : Tensor() {} + + /* Constructor with place should only be used in pybind */ + explicit LoDTensor(const platform::Place& place) : Tensor(place) {} + + explicit LoDTensor(const LoD& lod) : lod_(lod) {} + + void set_lod(const LoD& lod) { lod_ = lod; } + + const LoD& lod() const { return lod_; } + + LoD* mutable_lod() { return &lod_; } + + /* + * Get the start offset and end offset of an element from LoD. + */ + std::pair lod_element(size_t level, size_t elem) const { + PADDLE_ENFORCE_LT(level, NumLevels()); + PADDLE_ENFORCE_LT(elem, NumElements(level)); + return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]); + } + + /* + * Number of LoDTensor's levels, each level has units of data, for example, + * in the sentence's view, article, paragraph, sentence are 3 levels. + */ + size_t NumLevels() const { return lod_.size(); } + /* + * Number of elements in a level. + */ + size_t NumElements(size_t level = 0) const { + PADDLE_ENFORCE_LT(level, NumLevels()); + // the last offset is the end of last element + return (lod_)[level].size() - 1; + } + + std::vector SplitLoDTensor( + const std::vector places) const; + + void MergeLoDTensor(const std::vector& lod_tensors, + platform::Place place); + + private: + LoD lod_; +}; + +/* + * Expand the `source` to fit the LoD of `lod`. 
For example, a `source` + * LoDTensor is + * - LoD: [0, 2] + * - tensor: [a0, a1] + * a `lod` is + * - LoD: [0 3 5] + * returns a new LoDTensor + * - [a0 a0 a0 a1 a1] + */ +template +LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, + const platform::Place& place) { + LoD abs_lod = ToAbsOffset(lod); + const auto& lod_level = lod[level]; + size_t num_instances = source.dims()[0]; + + // new tensor + LoDTensor tensor; + tensor.set_lod(lod); + auto dims = source.dims(); + dims[0] = lod_level.back(); + tensor.Resize(dims); + tensor.mutable_data(place); + + PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1); + for (size_t ins = 0; ins < num_instances; ins++) { + for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { + auto slice = tensor.Slice(elem, elem + 1); + Copy(source.Slice(ins, ins + 1), platform::CPUPlace(), + platform::CPUDeviceContext(), &slice); + } + } + return tensor; +} + +// Get the absolute offset of a lod[start_level][start_idx:end_idx] and +// relative length of details for every level (i.e., [start_level: ]). +// +// For example, +// lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] +// start_level = 0 +// start_idx = 1 +// end_idx = 3 +// +// Returns: +// LoD = [[1, 4], [2, 4, 2, 3, 2]] +// pair = {11, 24} +std::pair> GetSubLoDAndAbsoluteOffset( + const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); + +void AppendLoD(LoD* lod, const LoD& lod_length); + +/* + * Serialize/Deserialize LoDTensor to std::ostream + * You can pass ofstream or ostringstream to serialize to file + * or to an in-memory string. GPU tensor will be copied to CPU. 
+ */ +void SerializeToStream(std::ostream& os, const LoDTensor& tensor, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, LoDTensor* tensor, + const platform::DeviceContext& dev_ctx); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/lod_tensor.md b/paddle/fluid/framework/lod_tensor.md similarity index 100% rename from paddle/framework/lod_tensor.md rename to paddle/fluid/framework/lod_tensor.md diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h new file mode 100644 index 0000000000000000000000000000000000000000..652513bd22597000e8249eb19776182d850793aa --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +using LoDTensorArray = std::vector; +} +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e0ed2495d68a8cc0d377aaf5b5103aea1064688 --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -0,0 +1,228 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/lod_tensor.h" + +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +TEST(LoD, data) { + LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i); + } +} + +TEST(LodExpand, test) { + LoD lod{{0, 2}}; + LoDTensor tensor; + tensor.set_lod(lod); + tensor.Resize({2, 1}); + tensor.mutable_data(platform::CPUPlace()); + tensor.data()[0] = 0; + tensor.data()[1] = 1; + + LoD target; + target.emplace_back(std::vector{0, 3, 5}); + auto new_tensor = LodExpand(tensor, target, 0UL, platform::CPUPlace()); + std::vector result{{0, 0, 0, 1, 1}}; + for (size_t i = 0; i < 5; i++) { + ASSERT_EQ(new_tensor.data()[i], result[i]); + } +} + +TEST(LoD, GetFineGrainedLoDLength) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5})); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + lod.push_back( + std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29})); + + auto lod_and_offset = + paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0); + LoD lod_length = lod_and_offset.first; + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + + LoD expected; + expected.push_back(std::vector{2}); + expected.push_back(std::vector{2, 2}); + expected.push_back(std::vector{2, 3, 
4, 2}); + EXPECT_EQ(lod_length, expected); + EXPECT_EQ(start_offset, 15UL); + EXPECT_EQ(end_offset, 26UL); +} + +TEST(LoD, AppendLoD) { + LoD lod_lens; + lod_lens.push_back(std::vector({2})); + lod_lens.push_back(std::vector({2, 2})); + lod_lens.push_back(std::vector({2, 3, 4, 2})); + + LoD origin; + origin.push_back(std::vector({0, 2})); + origin.push_back(std::vector({0, 1, 6})); + origin.push_back(std::vector({0, 2, 5, 7, 10, 12, 15})); + + paddle::framework::AppendLoD(&origin, lod_lens); + + LoD expected; + expected.push_back(std::vector({0, 2, 4})); + expected.push_back(std::vector({0, 1, 6, 8, 10})); + expected.push_back( + std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26})); + EXPECT_EQ(origin, expected); +} + +TEST(LoD, ToAbsOffset) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod); + + LoD expected; + expected.push_back(std::vector({0, 5})); + expected.push_back(std::vector({0, 2, 5})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(abs_lod, expected); +} + +TEST(LoD, SplitLoDTensor) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5, 6})); + lod.push_back(std::vector({0, 1, 6, 8, 13, 15, 20})); + + platform::CPUPlace place; + LoDTensor lod_tensor; + lod_tensor.Resize({20, 1}); + float* dst_ptr = lod_tensor.mutable_data(place); + for (int i = 0; i < lod_tensor.numel(); ++i) { + dst_ptr[i] = i; + } + lod_tensor.set_lod(lod); + + std::vector places{platform::CPUPlace(), + platform::CPUPlace()}; + LoD lod0; + lod0.push_back(std::vector({0, 2, 4})); + lod0.push_back(std::vector({0, 1, 6, 8, 13})); + LoD lod1; + lod1.push_back(std::vector({0, 1, 2})); + lod1.push_back(std::vector({0, 2, 7})); + + auto lods = lod_tensor.SplitLoDTensor(places); + EXPECT_EQ(lods[0].lod(), lod0); + EXPECT_EQ(lods[1].lod(), lod1); +} + +TEST(LoD, MergeLoDTensor) { + LoD lod; + 
lod.push_back(std::vector({0, 2, 4, 5, 6})); + lod.push_back(std::vector({0, 1, 6, 8, 13, 15, 20})); + + platform::CPUPlace place; + + LoDTensor lod_tensor0; + LoD lod0; + lod0.push_back(std::vector({0, 2, 4})); + lod0.push_back(std::vector({0, 1, 6, 8, 13})); + lod_tensor0.set_lod(lod0); + + lod_tensor0.Resize({13, 1}); + float* dst_ptr = lod_tensor0.mutable_data(place); + for (int i = 0; i < lod_tensor0.numel(); ++i) { + dst_ptr[i] = i; + } + + LoDTensor lod_tensor1; + LoD lod1; + lod1.push_back(std::vector({0, 1, 2})); + lod1.push_back(std::vector({0, 2, 7})); + lod_tensor1.set_lod(lod1); + lod_tensor1.Resize({7, 1}); + dst_ptr = lod_tensor1.mutable_data(place); + for (int i = 0; i < lod_tensor1.numel(); ++i) { + dst_ptr[i] = i; + } + + std::vector lods{&lod_tensor0, &lod_tensor1}; + + LoDTensor lod_tensor; + lod_tensor.MergeLoDTensor(lods, place); + EXPECT_EQ(lod_tensor.lod(), lod); +} + +TEST(LoD, CheckLoD) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + // check compatible + ASSERT_TRUE(CheckLoD(relative_lod)); + relative_lod[1].back()++; + ASSERT_FALSE(CheckLoD(relative_lod)); + relative_lod[1].back()--; // recover it + + // check empty + LoD empty_lod; + ASSERT_TRUE(CheckLoD(empty_lod)); + + // check less than 2 offsets in a level + LoD some_lod0; + some_lod0.push_back(std::vector({0})); + ASSERT_FALSE(CheckLoD(some_lod0)); + + // check with underlying tensor storage. + ASSERT_TRUE(CheckLoD(relative_lod, 5)); + ASSERT_FALSE(CheckLoD(relative_lod, 9)); +} + +TEST(LoD, CheckAbsLoD) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + auto abs_lod = ToAbsOffset(relative_lod); + + ASSERT_TRUE(CheckAbsLoD(abs_lod)); + + // check less than 2 offsets in a level. 
+ + // check the last item should be compatible with tensor height. + abs_lod.back().back()++; + ASSERT_FALSE(CheckAbsLoD(abs_lod)); + abs_lod.back().back()--; // restore + + // check less than 2 offsets in a lod. + LoD abs_lod0; + abs_lod0.push_back(std::vector({0})); + ASSERT_FALSE(CheckAbsLoD(abs_lod0)); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..4dd7810c1b25cbfeb7d6d79034a97db3f1d67ebb --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/place.h" + +__global__ void test(size_t* a, int size) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; + i += blockDim.x * gridDim.x) { + a[i] *= 2; + } +} + +TEST(LoD, data) { + paddle::framework::InitDevices(); + + paddle::framework::LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + paddle::platform::CUDAPlace gpu(0); + test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); + cudaDeviceSynchronize(); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i * 2); + } +} + +TEST(LoDTensor, LoDInGPU) { + paddle::framework::InitDevices(); + + paddle::framework::LoDTensor lod_tensor; + paddle::platform::CUDAPlace place(0); + + paddle::framework::LoD src_lod; + src_lod.push_back(std::vector{0, 2, 4, 6, 8, 10, 12, 14}); + + lod_tensor.Resize({14, 16}); + lod_tensor.mutable_data(place); + + lod_tensor.set_lod(src_lod); + EXPECT_EQ(lod_tensor.lod_element(0, 2).first, 4UL); + EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL); + + auto lod = lod_tensor.lod(); + + test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); + cudaDeviceSynchronize(); + + for (size_t i = 0; i < src_lod[0].size(); ++i) { + EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); + } +} diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..9756754260d46519d181f95e000f39ba92d22ef0 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector.h @@ -0,0 +1,363 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + +#include "glog/logging.h" + +namespace paddle { +namespace framework { + +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. +template +class Vector { + public: + using value_type = T; + + // Default ctor. Create empty Vector + Vector() { InitEmpty(); } + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T& value = T()) { + if (count == 0) { + InitEmpty(); + } else { + resize(count); + T* ptr = begin(); + for (size_t i = 0; i < count; ++i) { + ptr[i] = value; + } + } + } + + // Ctor with init_list + Vector(std::initializer_list init) { + if (init.size() == 0) { + InitEmpty(); + } else { + InitByIter(init.size(), init.begin(), init.end()); + } + } + + // implicit cast from std::vector. 
+ template + Vector(const std::vector& dat) { // NOLINT + if (dat.size() == 0) { + InitEmpty(); + } else { + InitByIter(dat.size(), dat.begin(), dat.end()); + } + } + + // Copy ctor + Vector(const Vector& other) { this->operator=(other); } + + // Copy operator + Vector& operator=(const Vector& other) { + if (other.size() != 0) { + this->InitByIter(other.size(), other.begin(), other.end()); + } else { + InitEmpty(); + } + return *this; + } + + // Move ctor + Vector(Vector&& other) { + this->size_ = other.size_; + this->flag_ = other.flag_; + if (other.cuda_vec_.memory_size()) { + this->cuda_vec_.ShareDataWith(other.cuda_vec_); + } + if (other.cpu_vec_.memory_size()) { + this->cpu_vec_.ShareDataWith(other.cpu_vec_); + } + } + + // CPU data access method. Mutable. + T& operator[](size_t i) { + MutableCPU(); + return const_cast(cpu_vec_.data())[i]; + } + + // CPU data access method. Immutable. + const T& operator[](size_t i) const { + ImmutableCPU(); + return cpu_vec_.data()[i]; + } + + // std::vector iterator methods. Based on CPU data access method + size_t size() const { return size_; } + + T* begin() { return &this->operator[](0); } + + T* end() { return &this->operator[](size()); } + + T& front() { return *begin(); } + + T& back() { + auto it = end(); + --it; + return *it; + } + + const T* begin() const { return &this->operator[](0); } + const T* end() const { return &this->operator[](size()); } + + const T& back() const { + auto it = end(); + --it; + return *it; + } + + T* data() { return begin(); } + + const T* data() const { return begin(); } + + const T& front() const { return *begin(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + InitByIter(end - begin, begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. 
+ void push_back(T elem) { + if (size_ + 1 > capacity()) { + reserve((size_ + 1) << 1); + } + *end() = elem; + ++size_; + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + size_t pre_size = size_; + resize(pre_size + (end - begin)); + T* ptr = this->begin() + pre_size; + for (; begin < end; ++begin, ++ptr) { + *ptr = *begin; + } + } + + // resize the vector + void resize(size_t size) { + if (size + 1 < capacity()) { + size_ = size; + } else { + MutableCPU(); + Tensor cpu_tensor; + platform::Place cpu = platform::CPUPlace(); + T* ptr = cpu_tensor.mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + const T* old_ptr = + cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + size_, ptr); + } + size_ = size; + cpu_vec_.ShareDataWith(cpu_tensor); + } + } + + // get cuda ptr. immutable + const T* CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return cuda_vec_.data(); + } + + // get cuda ptr. mutable + T* CUDAMutableData(platform::Place place) { + const T* ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + size_ = 0; + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { + return cpu_vec_.memory_size() / SizeOfType(typeid(T)); + } + + // reserve data + void reserve(size_t size) { + size_t pre_size = size_; + resize(size); + resize(pre_size); + } + + // the unify method to access CPU or CUDA data. immutable. + const T* Data(platform::Place place) const { + if (platform::is_gpu_place(place)) { + return CUDAData(place); + } else { + return data(); + } + } + + // the unify method to access CPU or CUDA data. mutable. 
+ T* MutableData(platform::Place place) { + if (platform::is_gpu_place(place)) { + return CUDAMutableData(place); + } else { + return data(); + } + } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + std::vector result; + result.resize(size()); + std::copy(begin(), end(), result.begin()); + return result; + } + + bool operator==(const Vector& other) const { + if (size() != other.size()) return false; + for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) { + if (*it1 != *it2) { + return false; + } + } + return true; + } + + private: + void InitEmpty() { + size_ = 0; + flag_ = kDataInCPU; + } + + template + void InitByIter(size_t size, Iter begin, Iter end) { + platform::Place cpu = platform::CPUPlace(); + T* ptr = this->cpu_vec_.template mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + for (size_t i = 0; i < size; ++i) { + *ptr++ = *begin++; + } + flag_ = kDataInCPU | kDirty; + size_ = size; + } + + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); + WaitPlace(cuda_vec_.place()); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == cuda_vec_.place())) { + framework::Tensor tmp; + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + cuda_vec_.ShareDataWith(tmp); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. 
+ Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + SetFlag(kDataInCUDA); + } else if (!(place == cuda_vec_.place())) { + framework::Tensor tmp; + WaitPlace(cuda_vec_.place()); + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + WaitPlace(place); + cuda_vec_.ShareDataWith(tmp); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void ImmutableCPU() const { + if (IsDirty() && + !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + static void WaitPlace(const platform::Place place) { + if (platform::is_gpu_place(place)) { + platform::DeviceContextPool::Instance() + .Get(boost::get(place)) + ->Wait(); + } + } + + mutable int flag_; + mutable Tensor cpu_vec_; + mutable Tensor cuda_vec_; + size_t size_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..a89064525661af71b22f18f835fd7b111956847b --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/platform/gpu_info.h" + +template +using vec = paddle::framework::Vector; + +TEST(mixed_vector, CPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + vec tmp2; + tmp2 = tmp; + ASSERT_EQ(tmp2.size(), 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp2[i], i); + ASSERT_EQ(tmp2[i], tmp[i]); + } + int cnt = 0; + for (auto& t : tmp2) { + ASSERT_EQ(t, cnt); + ++cnt; + } +} + +static __global__ void multiply_10(int* ptr) { + for (int i = 0; i < 10; ++i) { + ptr[i] *= 10; + } +} + +cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { + return reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); +} + +TEST(mixed_vector, GPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu(0); + + multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); + + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 10); + } +} + +TEST(mixed_vector, MultiGPU) { + if (paddle::platform::GetCUDADeviceCount() < 2) { + LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " + "GPUs in your machine."; + return; + } + + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu0(0); + paddle::platform::SetDeviceId(0); + multiply_10<<<1, 1, 0, 
GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + paddle::platform::CUDAPlace gpu1(1); + auto* gpu1_ptr = tmp.MutableData(gpu1); + paddle::platform::SetDeviceId(1); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 100); + } +} diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..cbc15e60b83397ed8420bc7a4cd716ef15979554 --- /dev/null +++ b/paddle/fluid/framework/op_desc.cc @@ -0,0 +1,521 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_desc.h" +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/shape_inference.h" + +namespace paddle { +namespace framework { + +class OpDesc; +class BlockDesc; +class CompileTimeInferShapeContext : public InferShapeContext { + public: + CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); + + bool HasInput(const std::string &name) const override; + + bool HasOutput(const std::string &name) const override; + + bool HasInputs(const std::string &name) const override; + + bool HasOutputs(const std::string &name) const override; + + AttrReader Attrs() const override; + + const std::vector &Inputs( + const std::string &name) const override; + + const std::vector &Outputs( + const std::string &name) const override; + + void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, + size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); + auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); + if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) { + VLOG(3) << "input " << in << " is not LodTensor"; + return; + } + PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR, + "The %d-th output of Output(%s) must be LoDTensor.", j, + out); + out_var->SetLoDLevel(in_var->GetLoDLevel()); + } + + bool IsRuntime() const override; + + protected: + proto::VarDesc::VarType GetVarType(const std::string &name) const override; + + DDim GetDim(const std::string &name) const override; + + void SetDim(const std::string &name, const DDim &dim) override; + + std::vector GetRepeatedDims(const std::string &name) const override; + + void SetRepeatedDims(const std::string &name, + const std::vector &dims) override; 
+ + InferShapeVarPtr GetVarPtr(const std::string &name) override; + + const OpDesc &op_; + const BlockDesc &block_; +}; + +OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) { + desc_.set_type(type); + inputs_ = inputs; + outputs_ = outputs; + attrs_ = attrs; + need_update_ = true; +} + +void OpDesc::CopyFrom(const OpDesc &op_desc) { + desc_.set_type(op_desc.Type()); + inputs_ = op_desc.inputs_; + outputs_ = op_desc.outputs_; + attrs_ = op_desc.attrs_; + need_update_ = true; +} + +OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block) + : desc_(desc), need_update_(false) { + // restore inputs_ + int input_size = desc_.inputs_size(); + for (int i = 0; i < input_size; ++i) { + const proto::OpDesc::Var &var = desc_.inputs(i); + std::vector &args = inputs_[var.parameter()]; + int argu_size = var.arguments_size(); + args.reserve(argu_size); + for (int j = 0; j < argu_size; ++j) { + args.push_back(var.arguments(j)); + } + } + // restore outputs_ + int output_size = desc_.outputs_size(); + for (int i = 0; i < output_size; ++i) { + const proto::OpDesc::Var &var = desc_.outputs(i); + std::vector &args = outputs_[var.parameter()]; + int argu_size = var.arguments_size(); + args.reserve(argu_size); + for (int j = 0; j < argu_size; ++j) { + args.push_back(var.arguments(j)); + } + } + // restore attrs_ + for (const proto::OpDesc::Attr &attr : desc_.attrs()) { + std::string attr_name = attr.name(); + // The sub_block referred to by the BLOCK attr hasn't been added + // to ProgramDesc class yet, we skip setting BLOCK attr here. 
+ if (attr.type() != proto::AttrType::BLOCK) { + attrs_[attr_name] = GetAttrValue(attr); + } + } + this->block_ = block; +} + +proto::OpDesc *OpDesc::Proto() { + Flush(); + return &desc_; +} + +const std::vector &OpDesc::Input(const std::string &name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, + Type()); + return it->second; +} + +std::vector OpDesc::InputArgumentNames() const { + std::vector retv; + for (auto &ipt : this->inputs_) { + retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); + } + return retv; +} + +void OpDesc::SetInput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + inputs_[param_name] = args; +} + +const std::vector &OpDesc::Output(const std::string &name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", + name, Type()); + return it->second; +} + +std::vector OpDesc::OutputArgumentNames() const { + std::vector retv; + for (auto &ipt : this->outputs_) { + retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); + } + return retv; +} + +void OpDesc::SetOutput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + this->outputs_[param_name] = args; +} + +proto::AttrType OpDesc::GetAttrType(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return static_cast(it->second.which() - 1); +} + +std::vector OpDesc::AttrNames() const { + std::vector retv; + retv.reserve(attrs_.size()); + for (auto &attr : attrs_) { + retv.push_back(attr.first); + } + return retv; +} + +void OpDesc::SetAttr(const std::string &name, const Attribute &v) { + this->attrs_[name] = v; + need_update_ = true; +} + +void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) { + this->attrs_[name] = █ + need_update_ = true; +} + +void OpDesc::SetAttrMap( + const 
std::unordered_map &attr_map) { + attrs_ = attr_map; + need_update_ = true; +} + +Attribute OpDesc::GetAttr(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return it->second; +} + +int OpDesc::GetBlockAttr(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return boost::get(it->second)->ID(); +} + +const std::unordered_map &OpDesc::GetAttrMap() const { + return attrs_; +} + +void OpDesc::Rename(const std::string &old_name, const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + need_update_ = true; +} + +void OpDesc::RenameOutput(const std::string &old_name, + const std::string &new_name) { + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + need_update_ = true; +} + +void OpDesc::RenameInput(const std::string &old_name, + const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + need_update_ = true; +} + +struct SetAttrDescVisitor : public boost::static_visitor { + explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {} + mutable proto::OpDesc::Attr *attr_; + void operator()(int v) const { attr_->set_i(v); } + void operator()(float v) const { attr_->set_f(v); } + void operator()(const std::string &v) const { attr_->set_s(v); } + + // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162 + template ::value>::type> + void operator()(T b) const { + attr_->set_b(b); + } + + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_ints()); + } + void operator()(const 
std::vector &v) const { + VectorToRepeated(v, attr_->mutable_floats()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_strings()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_bools()); + } + void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } + void operator()(int64_t v) const { attr_->set_l(v); } + void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } +}; + +void OpDesc::Flush() { + if (need_update_) { + this->desc_.mutable_inputs()->Clear(); + for (auto &ipt : inputs_) { + auto *input = desc_.add_inputs(); + input->set_parameter(ipt.first); + VectorToRepeated(ipt.second, input->mutable_arguments()); + } + + this->desc_.mutable_outputs()->Clear(); + for (auto &opt : outputs_) { + auto *output = desc_.add_outputs(); + output->set_parameter(opt.first); + VectorToRepeated(opt.second, output->mutable_arguments()); + } + + this->desc_.mutable_attrs()->Clear(); + for (auto &attr : attrs_) { + auto *attr_desc = desc_.add_attrs(); + attr_desc->set_name(attr.first); + attr_desc->set_type( + static_cast(attr.second.which() - 1)); + SetAttrDescVisitor visitor(attr_desc); + boost::apply_visitor(visitor, attr.second); + } + + need_update_ = false; + } +} + +static std::once_flag init_infer_shape_funcs; + +static void InitInferShapeFuncs() { + std::call_once(init_infer_shape_funcs, [] { + auto &map = OpInfoMap::Instance(); + auto &info_map = *map.mutable_map(); + + for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { + auto op_type = kern_pair.first; + auto &op_info = info_map.at(op_type); + auto op = static_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); + if (op_info.infer_shape_) { // infer_shape has been registered. 
+ continue; + } + op_info.infer_shape_ = [op](InferShapeContext *ctx) { + op->InferShape(ctx); + }; + } + }); +} + +void OpDesc::CheckAttrs() { + PADDLE_ENFORCE(!Type().empty(), + "CheckAttr() can not be called before type is setted."); + auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); + if (checker == nullptr) { + // checker is not configured. That operator could be generated by Paddle, + // not by users. + return; + } + checker->Check(attrs_); +} + +void OpDesc::InferShape(const BlockDesc &block) const { + VLOG(3) << "CompileTime infer shape on " << Type(); + InitInferShapeFuncs(); + auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; + PADDLE_ENFORCE(static_cast(infer_shape), + "%s's infer_shape has not been registered", this->Type()); + CompileTimeInferShapeContext ctx(*this, block); + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + auto inames = this->InputArgumentNames(); + sout << " From ["; + std::copy(inames.begin(), inames.end(), + std::ostream_iterator(sout, ", ")); + sout << "] to ["; + auto onames = this->OutputArgumentNames(); + std::copy(onames.begin(), onames.end(), + std::ostream_iterator(sout, ", ")); + sout << "]"; + VLOG(10) << sout.str(); + } + infer_shape(&ctx); +} + +void OpDesc::InferVarType(BlockDesc *block) const { + auto &info = OpInfoMap::Instance().Get(this->Type()); + if (info.infer_var_type_) { + info.infer_var_type_(*this, block); + } else { + // all output type is LoDTensor by default + VLOG(10) << this->Type() + << " has not registered InferVarType. 
Set output variables to " + "LOD_TENSOR"; + for (auto &out_pair : this->outputs_) { + for (auto &out_var_name : out_pair.second) { + block->FindRecursiveOrCreateVar(out_var_name) + .SetType(proto::VarDesc::LOD_TENSOR); + } + } + } +} + +CompileTimeInferShapeContext::CompileTimeInferShapeContext( + const OpDesc &op, const BlockDesc &block) + : op_(op), block_(block) {} + +bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + auto length = input_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(input_names[0]); +} + +bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + auto length = output_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(output_names[0]); +} + +bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + if (input_names.empty()) { + return false; + } + for (auto &input : input_names) { + if (!block_.HasVarRecursive(input)) return false; + } + return true; +} + +bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + if (output_names.empty()) { + return false; + } + for (auto &output : output_names) { + if (!block_.HasVarRecursive(output)) return false; + } + return true; +} + +AttrReader CompileTimeInferShapeContext::Attrs() const { + return AttrReader(op_.GetAttrMap()); +} + +const std::vector &CompileTimeInferShapeContext::Inputs( + const std::string &name) const { + return op_.Input(name); +} + +const std::vector 
&CompileTimeInferShapeContext::Outputs( + const std::string &name) const { + return op_.Output(name); +} + +DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + DDim res; + try { + auto shape = var->GetShape(); + res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); + } catch (...) { + VLOG(5) << "GetDim of variable " << name << " error"; + std::rethrow_exception(std::current_exception()); + } + return res; +} + +std::vector CompileTimeInferShapeContext::GetRepeatedDims( + const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector res; + try { + auto shapes = var->GetShapes(); + for (const auto &s : shapes) { + res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s)); + } + } catch (...) { + VLOG(5) << "GetRepeatedDim of variable " << name << " error."; + std::rethrow_exception(std::current_exception()); + } + return res; +} + +void CompileTimeInferShapeContext::SetDim(const std::string &name, + const DDim &dim) { + block_.FindVarRecursive(name)->SetShape(vectorize(dim)); +} + +void CompileTimeInferShapeContext::SetRepeatedDims( + const std::string &name, const std::vector &dims) { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector> dim_vec(dims.size()); + std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize); + var->SetShapes(dim_vec); +} + +bool CompileTimeInferShapeContext::IsRuntime() const { return false; } + +proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType( + const std::string &name) const { + return block_.FindVarRecursive(name)->GetType(); +} + +InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr( + const std::string &name) { + return block_.FindVarRecursive(name); +} + +} // namespace framework +} // namespace 
paddle diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..698df829e56e1182e742db926a712497ee2b6966 --- /dev/null +++ b/paddle/fluid/framework/op_desc.h @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { + +class BlockDesc; +class ProgramDesc; +class OpDesc { + public: + OpDesc() {} + + OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs); + + OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block); + + explicit OpDesc(BlockDesc *block) : block_(block) {} + + OpDesc(const OpDesc &other, BlockDesc *block) { + *this = other; + block_ = block; + } + + void CopyFrom(const OpDesc &op_desc); + + proto::OpDesc *Proto(); + + std::string Type() const { return desc_.type(); } + + void SetType(const std::string &type) { desc_.set_type(type); } + + const std::vector &Input(const std::string &name) const; + + std::vector InputArgumentNames() const; + + void SetInput(const std::string ¶m_name, + const std::vector &args); + + const std::vector &Output(const std::string &name) const; + + std::vector OutputArgumentNames() 
const; + + void SetOutput(const std::string ¶m_name, + const std::vector &args); + + bool HasAttr(const std::string &name) const { + return attrs_.find(name) != attrs_.end(); + } + + proto::AttrType GetAttrType(const std::string &name) const; + + std::vector AttrNames() const; + + void SetAttr(const std::string &name, const Attribute &v); + + void SetBlockAttr(const std::string &name, BlockDesc &block); + + Attribute GetAttr(const std::string &name) const; + + int GetBlockAttr(const std::string &name) const; + + void Rename(const std::string &old_name, const std::string &new_name); + + void RenameOutput(const std::string &old_name, const std::string &new_name); + + void RenameInput(const std::string &old_name, const std::string &new_name); + + // Only be used in C++ + const AttributeMap &GetAttrMap() const; + + // Only be used in C++ + void SetAttrMap(const AttributeMap &attr_map); + + std::vector InputNames() const { return MapKeys(inputs_); } + std::vector OutputNames() const { return MapKeys(outputs_); } + + void SetInputMap(const VariableNameMap &input) { + this->inputs_ = input; + this->need_update_ = true; + } + + void SetOutputMap(const VariableNameMap &output) { + this->outputs_ = output; + this->need_update_ = true; + } + + const VariableNameMap &Inputs() const { return inputs_; } + + const VariableNameMap &Outputs() const { return outputs_; } + + AttributeMap *MutableAttrMap() { + this->need_update_ = true; + return &this->attrs_; + } + + void CheckAttrs(); + + void InferShape(const BlockDesc &block) const; + + void InferVarType(BlockDesc *block) const; + + void MarkAsTarget() { desc_.set_is_target(true); } + + void Flush(); + + BlockDesc *Block() { return this->block_; } + + void SetBlock(BlockDesc *block) { this->block_ = block; } + + private: + template + static std::vector MapKeys(const MapType &map) { + std::vector ret_val; + ret_val.reserve(map.size()); + std::transform( + map.begin(), map.end(), std::back_inserter(ret_val), + [](const typename 
MapType::value_type &pair) { return pair.first; }); + return ret_val; + } + + proto::OpDesc desc_; + BlockDesc *block_; // not_own + // input arg name => input variable names + VariableNameMap inputs_; + // output arg name => output variable names + VariableNameMap outputs_; + AttributeMap attrs_; + + // need_update_ indicate there some local changes not be synchronized. If + // local changes should be synchronized, need_update_ should be set to true. + bool need_update_{false}; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..703c9c3234b62e80c3f768ddb892584c1c0070c0 --- /dev/null +++ b/paddle/fluid/framework/op_info.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_info.h" + +namespace paddle { +namespace framework { + +static OpInfoMap* g_op_info_map = nullptr; + +OpInfoMap& OpInfoMap::Instance() { + if (g_op_info_map == nullptr) { + g_op_info_map = new OpInfoMap(); + } + return *g_op_info_map; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h new file mode 100644 index 0000000000000000000000000000000000000000..e6b3ff9e653196b9234e02131f37d5964c4f6e84 --- /dev/null +++ b/paddle/fluid/framework/op_info.h @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class InferShapeBase { + public: + virtual ~InferShapeBase() = default; + virtual void operator()(InferShapeContext*) const = 0; +}; + +struct OpInfo { + OpCreator creator_; + GradOpMakerFN grad_op_maker_; + proto::OpProto* proto_{nullptr}; + OpAttrChecker* checker_{nullptr}; + InferVarTypeFN infer_var_type_; + InferShapeFN infer_shape_; + + bool HasOpProtoAndChecker() const { + return proto_ != nullptr && checker_ != nullptr; + } + + const proto::OpProto& Proto() const { + PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); + PADDLE_ENFORCE(proto_->IsInitialized(), + "Operator Proto must be initialized in op info"); + return *proto_; + } + + const OpCreator& Creator() const { + PADDLE_ENFORCE_NOT_NULL(creator_, + "Operator Creator has not been registered"); + return creator_; + } + + const GradOpMakerFN& GradOpMaker() const { + PADDLE_ENFORCE_NOT_NULL(grad_op_maker_, + "Operator GradOpMaker has not been registered."); + return grad_op_maker_; + } + + const OpAttrChecker* Checker() const { return checker_; } +}; + +class OpInfoMap { + public: + static OpInfoMap& Instance(); + + bool Has(const std::string& op_type) const { + return map_.find(op_type) != map_.end(); + } + + void Insert(const std::string& type, const OpInfo& info) { + PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); + map_.insert({type, info}); + } + + const OpInfo& Get(const std::string& type) const { + auto op_info_ptr = GetNullable(type); + PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered", + type); + return *op_info_ptr; + } + + const OpInfo* GetNullable(const std::string& type) const { + auto it = map_.find(type); + if (it == map_.end()) { + return nullptr; + } else { + return 
&it->second; + } + } + + const std::unordered_map& map() const { return map_; } + + std::unordered_map* mutable_map() { return &map_; } + + private: + OpInfoMap() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(OpInfoMap); +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h new file mode 100644 index 0000000000000000000000000000000000000000..b5dbff26d7edc212a270d4d187dbb868068790c9 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/library_type.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +struct OpKernelType { + struct Hash { + size_t operator()(const OpKernelType& key) const { + int place = key.place_.which(); + int data_type = static_cast(key.data_type_) << LEFT_SHIFT; + int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); + int library_type = static_cast(key.library_type_) + << (LEFT_SHIFT * 3); + + std::hash hasher; + return hasher(place + data_type + data_layout + library_type); + } + }; + + // place, data_type, library_type kinds less than 2^8 + constexpr static int LEFT_SHIFT = 8; + + proto::DataType data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + + OpKernelType(proto::DataType data_type, platform::Place place, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(place), + library_type_(library_type) {} + + OpKernelType(proto::DataType data_type, + const platform::DeviceContext& dev_ctx, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(dev_ctx.GetPlace()), + library_type_(library_type) {} + + bool operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_; + } + + bool operator!=(const OpKernelType& o) const { return !(*this == o); } +}; + +inline std::ostream& operator<<(std::ostream& os, + const OpKernelType& kernel_key) { + os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" + << 
kernel_key.data_layout_ << "]:place[" << kernel_key.place_ + << "]:library_type[" << kernel_key.library_type_ << "]"; + return os; +} + +inline std::string KernelTypeToString(const OpKernelType& kernel_key) { + std::ostringstream stream; + stream << kernel_key; + return stream.str(); +} + +inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { + return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; +} + +inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { + return (!platform::places_are_same_class(l.place_, r.place_)) || + (l.data_type_ != r.data_type_) || + NeedTransformLayout(l.data_layout_, r.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..64096907df5a52904525ef0bf25bb9527c3a8c4b --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_kernel_type.h" +#include +#include + +TEST(OpKernelType, ToString) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), + "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type[" + "CUDNN]"); +} + +TEST(OpKernelType, Hash) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using CUDAPlace = paddle::platform::CUDAPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + + OpKernelType::Hash hasher; + ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); +} diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a779b10b49ab35dd0dbe25ac3f2bccd34fb654e --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { + +void OpProtoAndCheckerMaker::Validate() { + validated_ = true; + CheckNoDuplicatedInOutAttrs(); +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( + const std::string& name, const std::string& comment) { + auto* input = proto_->add_inputs(); + input->set_name(name); + input->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{input}; +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( + const std::string& name, const std::string& comment) { + auto* output = proto_->add_outputs(); + output->set_name(name); + output->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{output}; +} + +void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { + std::unordered_set names; + auto checker = [&](const std::string& name) { + PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name); + names.insert(name); + }; + for (auto& attr : proto_->attrs()) { + checker(attr.name()); + } + for (auto& input : proto_->inputs()) { + checker(input.name()); + } + for (auto& output : proto_->outputs()) { + checker(output.name()); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h new file mode 100644 index 0000000000000000000000000000000000000000..1dbfc7d37be6ae79fde39434b12355a54ee648f6 --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 
PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { + +// this class not only make proto but also init attribute checkers. +class OpProtoAndCheckerMaker { + public: + using OpProto = proto::OpProto; + using OpAttrChecker = framework::OpAttrChecker; + OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : proto_(proto), op_checker_(op_checker) {} + + virtual ~OpProtoAndCheckerMaker() { + PADDLE_ENFORCE(validated_, "should call Validate after build"); + } + + void Validate(); + + protected: + struct VariableBuilder { + OpProto::Var* var_; + + VariableBuilder& AsDuplicable() { + var_->set_duplicable(true); + return *this; + } + + VariableBuilder& AsIntermediate() { + var_->set_intermediate(true); + return *this; + } + + VariableBuilder& AsDispensable() { + var_->set_dispensable(true); + return *this; + } + }; + + VariableBuilder AddInput(const std::string& name, const std::string& comment); + + VariableBuilder AddOutput(const std::string& name, + const std::string& comment); + + template + TypedAttrChecker& AddAttr(const std::string& name, + const std::string& comment, + bool generated = false) { + auto* attr = proto_->add_attrs(); + attr->set_name(name); + attr->set_comment(comment); + attr->set_generated(generated); + attr->set_type(AttrTypeID()); + return 
op_checker_->AddAttrChecker(name); + } + + void AddComment(const std::string& comment) { proto_->set_comment(comment); } + + private: + void CheckNoDuplicatedInOutAttrs(); + + OpProto* proto_; + OpAttrChecker* op_checker_; + bool validated_{false}; +}; + +class NOPMaker : public OpProtoAndCheckerMaker { + public: + NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) {} +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cfefee8dbdead9dd0074d954fe7318baae57e8c4 --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_proto_maker.h" + +#include "gtest/gtest.h" + +class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + TestAttrProtoMaker(paddle::framework::proto::OpProto* proto, + paddle::framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("scale", "scale of test op"); + AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} + +class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + TestInOutProtoMaker(paddle::framework::proto::OpProto* proto, + paddle::framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedInOut) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..739ec72ebc17e31ab207b0e2260d7f563ceaca6e --- /dev/null +++ b/paddle/fluid/framework/op_registry.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +#include + +namespace paddle { +namespace framework { + +std::unique_ptr OpRegistry::CreateOp( + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, AttributeMap attrs) { + auto& info = OpInfoMap::Instance().Get(type); + if (info.Checker() != nullptr) { + info.Checker()->Check(attrs); + } + auto op = info.Creator()(type, inputs, outputs, attrs); + return std::unique_ptr(op); +} + +static VariableNameMap ConvertOpDescVarsToVarNameMap( + const google::protobuf::RepeatedPtrField& + op_desc_vars) { + VariableNameMap ret_val; + for (auto& var : op_desc_vars) { + auto& var_names = ret_val[var.parameter()]; + auto& var_names_in_proto = var.arguments(); + var_names.reserve(static_cast(var_names_in_proto.size())); + std::copy(var_names_in_proto.begin(), var_names_in_proto.end(), + std::back_inserter(var_names)); + } + return ret_val; +} + +std::unique_ptr OpRegistry::CreateOp( + const proto::OpDesc& op_desc) { + VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" + "used in unit tests. 
Use CreateOp(const OpDesc& op_desc) " + "instead."; + VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); + VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); + AttributeMap attrs; + for (auto& attr : op_desc.attrs()) { + attrs[attr.name()] = GetAttrValue(attr); + } + + return CreateOp(op_desc.type(), inputs, outputs, attrs); +} + +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { + return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(), + op_desc.GetAttrMap()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..73faa99668ad58ddb66de515eb4750883f58bcf5 --- /dev/null +++ b/paddle/fluid/framework/op_registry.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" // For VLOG() +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/details/op_registry.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/shape_inference.h" + +namespace paddle { +namespace framework { +class Registrar { + public: + // In our design, various kinds of classes, e.g., operators and kernels, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_OP macros to + // call this method. So, as long as the callee code calls USE_OP, the global + // registrar variable won't be removed by the linker. 
+ void Touch() {} +}; + +template +struct OperatorRegistrar : public Registrar { + explicit OperatorRegistrar(const char* op_type) { + PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), + "'%s' is registered more than once.", op_type); + static_assert(sizeof...(ARGS) != 0, + "OperatorRegistrar should be invoked at least by OpClass"); + OpInfo info; + details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info); + OpInfoMap::Instance().Insert(op_type, info); + } +}; + +class OpRegistry { + public: + static std::unique_ptr CreateOp(const std::string& type, + const VariableNameMap& inputs, + const VariableNameMap& outputs, + AttributeMap attrs); + + static std::unique_ptr CreateOp(const proto::OpDesc& op_desc); + + static std::unique_ptr CreateOp(const OpDesc& op_desc); +}; + +template +struct OpKernelRegistrarFunctor; + +template +struct OpKernelRegistrarFunctor { + using KERNEL_TYPE = + typename std::tuple_element>::type; + + void operator()(const char* op_type, const char* library_type) const { + using T = typename KERNEL_TYPE::ELEMENT_TYPE; + OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), + DataLayout::kAnyLayout, StringToLibraryType(library_type)); + OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); + + constexpr auto size = std::tuple_size>::value; + OpKernelRegistrarFunctor + func; + func(op_type, library_type); + } +}; + +template +struct OpKernelRegistrarFunctor { + void operator()(const char* op_type, const char* library_type) const {} +}; + +// User can register many kernel in one place. The data type could be different. +template +class OpKernelRegistrar : public Registrar { + public: + explicit OpKernelRegistrar(const char* op_type, const char* library_type) { + OpKernelRegistrarFunctor func; + func(op_type, library_type); + } +}; + +/** + * check if MACRO is used in GLOBAL NAMESPACE. 
+ */ +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +/* + The variadic arguments should be class types derived from one of the + following classes: + OpProtoAndCheckerMaker + GradOpDescMakerBase + VarTypeInference + InferShapeBase +*/ +#define REGISTER_OPERATOR(op_type, op_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op__##op_type, \ + "REGISTER_OPERATOR must be called in global namespace"); \ + class _OpClass_##op_type##_ : public op_class { \ + public: \ + DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_); \ + DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class); \ + }; \ + static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \ + ##__VA_ARGS__> \ + __op_registrar_##op_type##__(#op_type); \ + int TouchOpRegistrar_##op_type() { \ + __op_registrar_##op_type##__.Touch(); \ + return 0; \ + } + +/** + * Macro to register Operator. When the input is duplicable, you should + * use REGISTER_OP_EX with drop_empty_grad=false instead. + */ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ + REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, true) + +// When an argument is duplicable, we need to use this version. +// Perhaps we can omit DropEmptyIG template parameter and +// only have one version of REGISTER_OP. 
+#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, drop_empty_grad) \ + REGISTER_OPERATOR(grad_op_type, grad_op_class); \ + class _GradOpDescMaker_##grad_op_type##_ \ + : public ::paddle::framework::DefaultGradOpDescMaker { \ + using ::paddle::framework::DefaultGradOpDescMaker< \ + drop_empty_grad>::DefaultGradOpDescMaker; \ + \ + protected: \ + virtual std::string GradOpType() const { return #grad_op_type; } \ + }; \ + REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ + op_maker_class); + +#define REGISTER_OP_WITH_KERNEL(op_type, ...) \ + REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \ + ##__VA_ARGS__) + +#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ + REGISTER_OPERATOR(op_type, op_class, op_maker_class) + +/** + * Macro to register OperatorKernel. + */ +#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + "REGISTER_OP_KERNEL must be called in global namespace"); \ + static ::paddle::framework::OpKernelRegistrar \ + __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ + #LIBRARY_TYPE); \ + int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ + __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ + return 0; \ + } + +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) + +#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) + +/** + * Macro to mark what Operator and Kernel + * we will use and tell the compiler to + * link them into target. 
+ */ +#define USE_OP_ITSELF(op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_itself_##op_type, \ + "USE_OP_ITSELF must be called in global namespace"); \ + extern int TouchOpRegistrar_##op_type(); \ + static int use_op_itself_##op_type##_ __attribute__((unused)) = \ + TouchOpRegistrar_##op_type() + +#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + "USE_OP_DEVICE_KERNEL must be in global namespace"); \ + extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \ + static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ \ + __attribute__((unused)) = \ + TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() + +// TODO(fengjiayi): The following macros +// seems ugly, do we have better method? + +#ifndef PADDLE_WITH_CUDA +#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU) +#else +#define USE_OP_KERNEL(op_type) \ + USE_OP_DEVICE_KERNEL(op_type, CPU); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) +#endif + +#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); + +#define USE_CPU_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CPU); + +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) + +#define USE_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_KERNEL(op_type) + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bfbb2cfc2c57c705cf42c65825edcc6dea08cf41 --- /dev/null +++ b/paddle/fluid/framework/op_registry_test.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace pd = paddle::framework; + +namespace paddle { +namespace framework { + +class CosineOp : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} +}; + +class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op"); + AddOutput("output", "output of cosine op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is cos op"); + } +}; + +class MyTestOp : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} +}; + +class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op").AsDuplicable(); + AddOutput("output", "output of cosine op").AsIntermediate(); + auto my_checker = [](int i) { + PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); + }; + AddAttr("test_attr", "a simple test attribute") + .AddCustomChecker(my_checker); + AddComment("This is my_test op"); + } +}; +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list 
arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + var->add_arguments(arg_name); + } +} +REGISTER_OP_WITHOUT_GRADIENT(cos_sim, paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker); +REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp, + paddle::framework::MyTestOpProtoAndCheckerMaker); + +TEST(OpRegistry, CreateOp) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + float scale = 3.3; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(scale); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); + float scale_get = op->Attr("scale"); + ASSERT_EQ(scale_get, scale); +} + +TEST(OpRegistry, IllegalAttr) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(-2.0); + + bool caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "larger_than check fail"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); +} + +TEST(OpRegistry, DefaultValue) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + 
ASSERT_TRUE(op_desc.IsInitialized()); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); + ASSERT_EQ(op->Attr("scale"), 1.0); +} + +TEST(OpRegistry, CustomChecker) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("my_test_op"); + BuildVar("input", {"ii"}, op_desc.add_inputs()); + BuildVar("output", {"oo"}, op_desc.add_outputs()); + + // attr 'test_attr' is not set + bool caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "Attribute 'test_attr' is required!"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + // set 'test_attr' set to an illegal value + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("test_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(3); + caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "'test_attr' must be even!"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + // set 'test_attr' set to a legal value + op_desc.mutable_attrs()->Clear(); + attr = op_desc.mutable_attrs()->Add(); + attr->set_name("test_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(4); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + op->Run(scope, cpu_place); + int test_attr = op->Attr("test_attr"); + ASSERT_EQ(test_attr, 4); +} + +class CosineOpComplete : public paddle::framework::CosineOp { + public: + DEFINE_OP_CONSTRUCTOR(CosineOpComplete, 
paddle::framework::CosineOp); + DEFINE_OP_CLONE_METHOD(CosineOpComplete); +}; + +TEST(OperatorRegistrar, Test) { + using namespace paddle::framework; + OperatorRegistrar reg("cos"); +} + +namespace paddle { +namespace framework { + +class OpKernelTestMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment("NoGradOp, same input output. no Grad"); + } +}; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); + } +}; + +template +class OpKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const {} +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel, + paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_CPU_KERNEL( + op_with_kernel, + paddle::framework::OpKernelTest); + +REGISTER_OP_CUDA_KERNEL(op_with_kernel, + paddle::framework::OpKernelTest< + paddle::platform::CUDADeviceContext, float>); + +TEST(OperatorRegistrar, CPU) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cpu_place); +} + +TEST(OperatorRegistrar, CUDA) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cuda_place); +} + +static int 
op_test_value = 0; + +using paddle::platform::DeviceContext; +using paddle::platform::CPUDeviceContext; +using paddle::platform::CUDADeviceContext; + +namespace paddle { +namespace framework { + +class OpWithMultiKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout, + framework::LibraryType::kCUDNN); + } +}; + +template +class OpMultiKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + ++op_test_value; + } +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + --op_test_value; + } +}; + +template +class OpMultiKernelTest2 : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value += 10; + } +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value -= 10; + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, + paddle::framework::OpWithMultiKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CPU, paddle::platform::CPUPlace, + 
paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest2); + +TEST(OperatorRegistrar, OpWithMultiKernel) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_multi_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + // TODO(qiao) add priority back + // use all available kernels + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -10); +} diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..61529fe38b15fe2a4bfa0d64159994d6b62fb086 --- /dev/null +++ b/paddle/fluid/framework/operator.cc @@ -0,0 +1,601 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include + +#include + +#include "paddle/fluid/framework/data_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/var_type.h" + +DECLARE_bool(benchmark); + +namespace paddle { +namespace framework { + +std::vector> kKernelPriority = { + std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), + std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), + std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN), + std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), +}; + +static DDim GetDims(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return DDim({-1}); + } + + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + return DDim({-1}); + } +} + +static LoD GetLoD(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + auto default_lod = LoD({{}}); + + if (var == nullptr) { + return default_lod; + } + + if (var->IsType()) { + return var->Get().lod(); + } else { + return default_lod; + } +} + +std::string OperatorBase::Input(const std::string& name) const { + auto& ins = Inputs(name); + PADDLE_ENFORCE_LE(ins.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + type_, name); + return ins.empty() ? kEmptyVarName : ins[0]; +} + +const std::vector& OperatorBase::Inputs( + const std::string& name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.", + type_, name); + return it->second; +} + +std::string OperatorBase::Output(const std::string& name) const { + auto& outs = Outputs(name); + PADDLE_ENFORCE_LE(outs.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + type_, name); + return outs.empty() ? 
kEmptyVarName : outs[0]; +} + +const std::vector& OperatorBase::Outputs( + const std::string& name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), + "Operator %s does not have an output called %s.", type_, name); + return it->second; +} + +std::string OperatorBase::DebugStringEx(const Scope* scope) const { + std::stringstream ss; + ss << "Op(" << type_ << "), inputs:{"; + for (auto it = inputs_.begin(); it != inputs_.end();) { + auto& input = *it; + ss << input.first << "["; + for (size_t i = 0; i < input.second.size(); ++i) { + ss << input.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, input.second[i]) << "]"; + ss << "(" << GetLoD(*scope, input.second[i]) << ")"; + } + if (i != input.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != inputs_.end()) { + ss << ", "; + } + } + ss << "}, outputs:{"; + for (auto it = outputs_.begin(); it != outputs_.end();) { + auto& output = *it; + ss << output.first << "["; + for (size_t i = 0; i < output.second.size(); ++i) { + ss << output.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, output.second[i]) << "]"; + ss << "(" << GetLoD(*scope, output.second[i]) << ")"; + } + if (i != output.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != outputs_.end()) { + ss << ", "; + } + } + ss << "}."; + return ss.str(); +} + +void OperatorBase::Rename(const std::string& old_name, + const std::string& new_name) { + for (auto& input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + for (auto& output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } +} + +OperatorBase::OperatorBase(const std::string& type, + const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) + : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) { + GenerateTemporaryNames(); + CheckAllInputOutputSet(); +} + +std::vector 
OperatorBase::InputVars() const { + std::vector ret_val; + for (auto& o : inputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; +} + +std::vector OperatorBase::OutputVars(bool has_intermediate) const { + std::vector ret_val; + if (has_intermediate) { + // push all outputs into ret_val + for (auto& o : outputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; + } + auto& info = OpInfoMap::Instance().Get(Type()); + + // get all OpProto::Var for outputs + for (auto& o : info.Proto().outputs()) { + // ignore all intermediate output + if (o.intermediate()) continue; + auto out = outputs_.find(o.name()); + if (out != outputs_.end()) { + ret_val.reserve(ret_val.size() + out->second.size()); + ret_val.insert(ret_val.end(), out->second.begin(), out->second.end()); + } + } + return ret_val; +} + +void OperatorBase::CheckAllInputOutputSet() const { + auto& info_map = OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) return; + + for (auto& in : op_info->Proto().inputs()) { + PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), + "Type %s's input %s is not set", Type(), in.name()); + } + + for (auto& out : op_info->Proto().outputs()) { + PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), + "Type %s's output %s is not set", Type(), out.name()); + } +} + +void OperatorBase::GenerateTemporaryNames() { + static std::atomic gUniqId(0UL); + for (auto& output : outputs_) { + for (auto& output_name : output.second) { + if (output_name == kTempVarName) { + output_name += type_; + output_name += "@"; + output_name += std::to_string(gUniqId.fetch_add(1)); + } + } + } +} + +static bool VarIsTensor(const Variable* var) { + return var->IsType() || var->IsType(); +} + +static const Tensor* 
GetTensorFromVar(Variable* var) { + if (var->IsType()) { + return var->GetMutable(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); + } +} + +static Tensor* GetMutableTensorFromVar(Variable* var) { + if (var->IsType()) { + return var->GetMutable(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); + } +} + +template <> +const Tensor* ExecutionContext::Input(const std::string& name) const { + auto* var = InputVar(name); + return var == nullptr ? nullptr + : GetTensorFromVar(const_cast(var)); +} + +template <> +const std::vector ExecutionContext::MultiInput( + const std::string& name) const { + auto names = op().Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr : GetTensorFromVar(var); + }); + return res; +} + +template <> +Tensor* ExecutionContext::Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : GetMutableTensorFromVar(var); +} + +template <> +std::vector ExecutionContext::MultiOutput( + const std::string& name) const { + auto names = op().Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? 
nullptr + : GetMutableTensorFromVar(var); + }); + return res; +} + +bool OpSupportGPU(const std::string& op_type) { + auto& all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + if (it == all_kernels.end()) { + // All control operator must support GPU + + return true; + } + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + return false; +} + +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) + : op_(op), scope_(scope) {} + + bool HasInput(const std::string& name) const override { + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto ipt = ins[0]; + auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + return var != nullptr; + } + + bool HasOutput(const std::string& name) const override { + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto ipt = outs[0]; + auto* var = ipt == kEmptyVarName ? 
nullptr : scope_.FindVar(ipt); + return var != nullptr; + } + + bool HasInputs(const std::string& name) const override { + auto inputs = op_.Inputs(name); + if (inputs.empty()) { + return false; + } + for (auto& input : inputs) { + if (scope_.FindVar(input) == nullptr) { + return false; + } + } + return true; + } + + bool HasOutputs(const std::string& name) const override { + auto outputs = op_.Outputs(name); + if (outputs.empty()) { + return false; + } + for (auto& output : outputs) { + if (scope_.FindVar(output) == nullptr) { + return false; + } + } + return true; + } + + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + const std::vector& Inputs( + const std::string& name) const override { + return op_.Inputs(name); + } + + const std::vector& Outputs( + const std::string& name) const override { + return op_.Outputs(name); + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + + // TODO(dzhwinter) : reuse ShareLoD in most operators. + // Need to call ShareLayout explicitly in sequence related ops. + // Shall we have a better method to shared info between in/out Tensor? 
+ out_tensor->set_layout(in_tensor.layout()); + } + + void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_layout(in_tensor.layout()); + } + + bool IsRuntime() const override { return true; } + + protected: + DDim GetDim(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW( + "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + std::vector GetRepeatedDims(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + return var->Get().shapes(); + } else { + PADDLE_THROW( + "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + void SetDim(const std::string& name, const DDim& dim) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); + } + } + + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->set_shapes(dims); + } else { + PADDLE_THROW( + "Only ReaderHolder support 'SetRepeatedDims', but 
Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + proto::VarDesc::VarType GetVarType(const std::string& name) const override { + auto* var = scope_.FindVar(name); + return ToVarType(var->Type()); + } + + InferShapeVarPtr GetVarPtr(const std::string& name) override { + return scope_.FindVar(name); + } + + private: + const OperatorBase& op_; + const Scope& scope_; +}; + +void OperatorWithKernel::Run(const Scope& scope, + const platform::Place& place) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } + + ExecutionContext ctx(*this, scope, *dev_ctx); + + OpKernelMap& kernels = kernels_iter->second; + + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. 
+ + // for (auto& candidate : kKernelPriority) { + // Do selection + // } + + auto expected_kernel_key = this->GetExpectedKernelType(ctx); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } + + // do data transform + Scope& new_scope = scope.NewScope(); + + for (auto& var_name_item : this->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + if (var && VarIsTensor(var)) { + auto* tensor_in = GetTensorFromVar(var); + if (tensor_in->IsInitialized()) { + auto kernel_type_for_var = this->GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), + var_name) != out_var_names.end()) { + PADDLE_THROW( + "var %s is both input and output, " + "does not support transform", + var_name); + } + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + auto* trans_var = new_scope.Var(var_name); + std::shared_ptr out(new Tensor); + DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, + out.get()); + CopyVariableWithTensor(*var, *(out.get()), *trans_var); + } + } + } + } + } + + auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); + kernel_iter->second->Compute( + ExecutionContext(*this, new_scope, *new_dev_ctx)); + + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + new_dev_ctx->Wait(); + } +} + +proto::DataType OperatorWithKernel::IndicateDataType( + const ExecutionContext& ctx) const { + auto& scope = ctx.scope(); + int data_type = -1; + for (auto& input : this->inputs_) { + for (auto& ipt_name : input.second) { + auto* var = scope.FindVar(ipt_name); 
+ if (var != nullptr) { + const Tensor* t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &(var->Get().value()); + } + if (t != nullptr) { + int tmp = static_cast(ToDataType(t->type())); + PADDLE_ENFORCE(tmp == data_type || data_type == -1, + "DataType of Paddle Op %s must be the same.", Type()); + data_type = tmp; + } + } + } + } + PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); + return static_cast(data_type); +} + +OpKernelType OperatorWithKernel::GetExpectedKernelType( + const ExecutionContext& ctx) const { + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); +} + +OpKernelType OperatorWithKernel::GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const { + return OpKernelType(expected_kernel_type.data_type_, tensor.place()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h new file mode 100644 index 0000000000000000000000000000000000000000..52300abeb7df346d610d2363335dc9d3330ee39e --- /dev/null +++ b/paddle/fluid/framework/operator.h @@ -0,0 +1,401 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" // For VLOG +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/utils/Error.h" + +namespace paddle { +namespace framework { + +/// If a variable is a empty variable, that name will be used. +constexpr char kEmptyVarName[] = "@EMPTY@"; + +/// If a variable is a temporary variable, that name will be set in Python, +/// but it will be convert to a unique name in scope after OpCreator. +constexpr char kTempVarName[] = "@TEMP@"; + +/// If a variable's name has a certain suffix, it means that the +/// variable is the gradient of another varibale. +/// e.g. Variable "x@GRAD" is the gradient of varibale "x". +constexpr char kGradVarSuffix[] = "@GRAD"; + +/// Variables with this suffix are supposed to be filled up with zeros. +constexpr char kZeroVarSuffix[] = "@ZERO"; + +// define some kernel priority +/* Define multiple kernel type fallback order*/ +extern std::vector> kKernelPriority; + +inline std::string GradVarName(const std::string& var_name) { + return var_name + kGradVarSuffix; +} + +class OperatorBase; +class ExecutionContext; + +/** + * OperatorBase has the basic element that Net will call to do computation. + * Only CreateOperator from OpRegistry will new Operator directly. User + * should always construct a proto message OpDesc and call + * OpRegistry::CreateOp(op_desc) to get an Operator instance. 
+ */ +class OperatorBase { + public: + OperatorBase(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs); + + virtual ~OperatorBase() {} + + template + inline const T& Attr(const std::string& name) const { + PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", + name); + return boost::get(attrs_.at(name)); + } + + /// if scope is not null, also show dimensions of arguments + virtual std::string DebugStringEx(const Scope* scope) const; + + std::string DebugString() const { return DebugStringEx(nullptr); } + + /// Net will call this function to Run an op. + virtual void Run(const Scope& scope, const platform::Place& place) const = 0; + + // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. + virtual void Stop() {} + + virtual bool IsNetOp() const { return false; } + + virtual bool SupportGPU() const { return false; } + + /// rename inputs outputs name + void Rename(const std::string& old_name, const std::string& new_name); + + const VariableNameMap& Inputs() const { return inputs_; } + const VariableNameMap& Outputs() const { return outputs_; } + + //! Get a input with argument's name described in `op_proto` + std::string Input(const std::string& name) const; + //! Get a input which has multiple variables. + const std::vector& Inputs(const std::string& name) const; + + std::vector InputVars() const; + + //! Get a output with argument's name described in `op_proto` + std::string Output(const std::string& name) const; + //! Get an output which has multiple variables. + //! TODO add a vector_view to prevent memory copy. 
+ const std::vector& Outputs(const std::string& name) const; + + virtual std::vector OutputVars(bool has_intermediate) const; + + const std::string& Type() const { return type_; } + void SetType(const std::string& type) { type_ = type; } + const AttributeMap& Attrs() const { return attrs_; } + + // Return a new operator instance, which is as same as this. + // Use unique_ptr to prevent caller forget to delete this pointer. + virtual std::unique_ptr Clone() const = 0; + + protected: + std::string type_; + // NOTE: in case of OpGrad, inputs_ contains: + // I (Inputs) + // O (Outputs) + // OG (Output Gradients) + VariableNameMap inputs_; + + // NOTE: in case of OpGrad, outputs_ contains + // IG (Inputs Gradients) + VariableNameMap outputs_; + AttributeMap attrs_; + + private: + void GenerateTemporaryNames(); + void CheckAllInputOutputSet() const; +}; + +// Macro for define a clone method. +// If you are writing an kernel operator, `Clone` will be defined when you +// register it. i.e. `Clone` method is not needed to define by yourself. +#define DEFINE_OP_CLONE_METHOD(cls) \ + std::unique_ptr<::paddle::framework::OperatorBase> Clone() const final { \ + return std::unique_ptr<::paddle::framework::OperatorBase>(new cls(*this)); \ + } + +// Macro for define a default constructor for Operator. +// You can also use +// using PARENT_CLASS::PARENT_CLASS; +// to use parent's constructor. 
+#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ + cls(const std::string& type, \ + const ::paddle::framework::VariableNameMap& inputs, \ + const ::paddle::framework::VariableNameMap& outputs, \ + const paddle::framework::AttributeMap& attrs) \ + : parent_cls(type, inputs, outputs, attrs) {} + +class NOP : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} + std::unique_ptr Clone() const override { + return std::unique_ptr(new NOP(*this)); + } +}; + +class ExecutionContext { + public: + ExecutionContext(const OperatorBase& op, const Scope& scope, + const platform::DeviceContext& device_context) + : op_(op), scope_(scope), device_context_(device_context) {} + + const OperatorBase& op() const { return op_; } + + const Scope& scope() const { return scope_; } + + template + inline const T& Attr(const std::string& name) const { + return op_.Attr(name); + } + + size_t InputSize(const std::string& name) const { + return op_.Inputs(name).size(); + } + + size_t OutputSize(const std::string& name) const { + return op_.Outputs(name).size(); + } + + const Variable* InputVar(const std::string& name) const { + auto ipt = op_.Input(name); + return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + } + + Variable* OutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); + } + + const std::vector MultiInputVar( + const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { + return name == kEmptyVarName ? 
nullptr + : scope_.FindVar(name); + }); + return res; + } + + std::vector MultiOutputVar(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { + return name == kEmptyVarName ? nullptr + : scope_.FindVar(name); + }); + return res; + } + + template + const T* Input(const std::string& name) const { + auto* var = InputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); + } + + template + const std::vector MultiInput(const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr : &var->Get(); + }); + return res; + } + + template + std::vector MultiOutput(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? 
nullptr : var->GetMutable(); + }); + return res; + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, InputSize(in)); + PADDLE_ENFORCE_LT(j, OutputSize(out)); + auto* in_var = MultiInputVar(in)[i]; + auto* out_var = MultiOutputVar(out)[j]; + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + } + + platform::Place GetPlace() const { return device_context_.GetPlace(); } + + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + + const platform::DeviceContext& device_context() const { + return device_context_; + } + +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + + //! Get actual name vector for this input. + const std::vector& Inputs(const std::string& name) const { + return op_.Inputs(name); + } + + //! Get actual name vector for this output. + const std::vector& Outputs(const std::string& name) const { + return op_.Outputs(name); + } + + private: + const OperatorBase& op_; + const Scope& scope_; + const platform::DeviceContext& device_context_; +}; + +template <> +const Tensor* ExecutionContext::Input(const std::string& name) const; + +template <> +const std::vector ExecutionContext::MultiInput( + const std::string& name) const; + +template <> +Tensor* ExecutionContext::Output(const std::string& name) const; + +template <> +std::vector ExecutionContext::MultiOutput( + const std::string& name) const; + +class OpKernelBase { + public: + /** + * ExecutionContext is the only parameter of Kernel Run function. 
+ * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * ExecutionContext. User should construct it before run the Operator. + */ + + virtual void Compute(const ExecutionContext& context) const = 0; + + virtual ~OpKernelBase() = default; +}; + +template +class OpKernel : public OpKernelBase { + public: + using ELEMENT_TYPE = T; +}; + +class OperatorWithKernel : public OperatorBase { + public: + using OpKernelMap = + std::unordered_map, + OpKernelType::Hash>; + + OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const Scope& scope, const platform::Place& place) const final; + + static std::unordered_map& + AllOpKernels() { + static std::unordered_map g_all_op_kernels; + return g_all_op_kernels; + } + + bool SupportGPU() const override { + auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); + return std::any_of(op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_gpu_place(kern_pair.first.place_); + }); + } + + virtual void InferShape(InferShapeContext* ctx) const { + OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); + } + + protected: + virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const; + + private: + // indicate kernel DataType by input data. Defaultly all input data must be + // same. 
+ proto::DataType IndicateDataType(const ExecutionContext& ctx) const; +}; + +extern bool OpSupportGPU(const std::string& op_type); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b90f5538bb620275521cdc11bf47b4014b2a66e2 --- /dev/null +++ b/paddle/fluid/framework/operator_test.cc @@ -0,0 +1,273 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +static int op_run_num = 0; + +class OpWithoutKernelTest : public OperatorBase { + public: + OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs), x(1) {} + void Run(const Scope& scope, const platform::Place& place) const override { + ++op_run_num; + ASSERT_EQ(static_cast(inputs_.size()), 1); + ASSERT_EQ(static_cast(outputs_.size()), 1); + ASSERT_EQ(scope.FindVar(inputs_.at("input")[0]), nullptr); + ASSERT_EQ(x, 1); + ASSERT_NE(scope.FindVar(outputs_.at("output")[0]), nullptr); + } + + public: + int x{0}; +}; + +class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddOutput("output", "output of test op"); + AddAttr("scale", "scale of cosine op"); + AddComment("This is test op"); + } +}; + +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +REGISTER_OP_WITHOUT_GRADIENT(test_operator, + paddle::framework::OpWithoutKernelTest, + paddle::framework::OpWithoutKernelCheckerMaker); + +TEST(OperatorBase, all) { + paddle::framework::InitDevices(); + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("test_operator"); + BuildVar("input", {"IN1"}, op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, op_desc.add_outputs()); + + auto attr = 
op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + scope.Var("OUT1"); + ASSERT_EQ(paddle::framework::op_run_num, 0); + op->Run(scope, cpu_place); + ASSERT_EQ(paddle::framework::op_run_num, 1); +} + +namespace paddle { +namespace framework { + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "input of test op"); + AddOutput("y", "output of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is test op"); + } +}; + +static int cpu_kernel_run_num = 0; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); + } +}; + +template +class CPUKernelTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + cpu_kernel_run_num++; + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); + } +}; + +class OpKernelTestMultiInputsProtoAndCheckerMaker + : public OpProtoAndCheckerMaker { + public: + OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("xs", "inputs of test op").AsDuplicable(); + AddInput("k", "input of test op"); + AddOutput("ys", "outputs of test op").AsDuplicable(); + AddAttr("scale", "scale of cosine op") + 
.SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is test op"); + } +}; + +class CPUKernalMultiInputsTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + auto xs = ctx.op().Inputs("xs"); + ASSERT_EQ(xs.size(), 3UL); + ASSERT_EQ(xs[0], "x0"); + ASSERT_EQ(xs[1], "x1"); + ASSERT_EQ(xs[2], "x2"); + + auto inVar0 = ctx.MultiInputVar("xs"); + ASSERT_EQ(inVar0.size(), 3U); + + auto intVar1 = ctx.InputVar("k"); + ASSERT_NE(intVar1, nullptr); + + auto outVar0 = ctx.MultiOutputVar("ys"); + ASSERT_EQ(outVar0.size(), 2U); + + auto inTensor0 = ctx.MultiInput("xs"); + ASSERT_EQ(inTensor0.size(), 3U); + + auto intTensor1 = ctx.Input("k"); + ASSERT_NE(intTensor1, nullptr); + + auto outTensor0 = ctx.MultiOutput("ys"); + ASSERT_EQ(outTensor0.size(), 2U); + + auto k = ctx.op().Input("k"); + ASSERT_EQ(k, "k0"); + + auto ys = ctx.op().Outputs("ys"); + ASSERT_EQ(ys.size(), 2UL); + ASSERT_EQ(ys[0], "y0"); + ASSERT_EQ(ys[1], "y1"); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + op_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_with_kernel, + paddle::framework::CPUKernelTest); + +// test with single input +TEST(OpKernel, all) { + paddle::framework::InitDevices(); + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("op_with_kernel"); + BuildVar("x", {"IN1"}, op_desc.add_inputs()); + BuildVar("y", {"OUT1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); + op->Run(scope, cpu_place); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); +} + +REGISTER_OP_WITHOUT_GRADIENT( + 
op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, + paddle::framework::CPUKernalMultiInputsTest); + +// test with multi inputs +TEST(OpKernel, multi_inputs) { + using namespace paddle::framework; + + paddle::framework::InitDevices(); + proto::OpDesc op_desc; + + op_desc.set_type("op_multi_inputs_with_kernel"); + BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); + BuildVar("k", {"k0"}, op_desc.add_inputs()); + BuildVar("ys", {"y0", "y1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + scope.Var("x0")->GetMutable(); + scope.Var("x1")->GetMutable(); + scope.Var("x2")->GetMutable(); + scope.Var("k0")->GetMutable(); + scope.Var("y0")->GetMutable(); + scope.Var("y1")->GetMutable(); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + op->Run(scope, cpu_place); +} + +class OperatorClone : public paddle::framework::OperatorBase { + public: + DEFINE_OP_CLONE_METHOD(OperatorClone); + OperatorClone(const std::string& type, + const paddle::framework::VariableNameMap& inputs, + const paddle::framework::VariableNameMap& outputs, + const paddle::framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const paddle::framework::Scope& scope, + const paddle::platform::Place& place) const override {} +}; + +TEST(Operator, Clone) { + paddle::framework::InitDevices(); + OperatorClone a("ABC", paddle::framework::VariableNameMap{}, + paddle::framework::VariableNameMap{}, + paddle::framework::AttributeMap{}); + auto b = a.Clone(); + ASSERT_EQ(a.Type(), b->Type()); +} diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc new file mode 100644 index 
0000000000000000000000000000000000000000..b3f2e97cd954bd55ab1a8c9def6938c877a79449 --- /dev/null +++ b/paddle/fluid/framework/program_desc.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace framework { + +BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { + auto *b = desc_.add_blocks(); + b->set_parent_idx(parent.ID()); + b->set_idx(desc_.blocks_size() - 1); + blocks_.emplace_back(new BlockDesc(this, b)); + return blocks_.back().get(); +} + +proto::ProgramDesc *ProgramDesc::Proto() { + for (auto &block : blocks_) { + block->Flush(); + } + return &desc_; +} + +ProgramDesc::ProgramDesc() { + auto *block = desc_.mutable_blocks()->Add(); + block->set_idx(kRootBlockIndex); + block->set_parent_idx(kNoneBlockIndex); + blocks_.emplace_back(new BlockDesc(this, block)); +} + +ProgramDesc::ProgramDesc(const ProgramDesc &o) { + desc_ = o.desc_; + for (int i = 0; i < desc_.blocks_size(); ++i) { + auto *block = desc_.mutable_blocks(i); + blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); + } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + 
op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } +} + +ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { + desc_ = desc; + for (auto &block_desc : *desc_.mutable_blocks()) { + blocks_.emplace_back(new BlockDesc(this, &block_desc)); + } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } +} + +ProgramDesc::ProgramDesc(const std::string &binary_str) { + PADDLE_ENFORCE(desc_.ParseFromString(binary_str), + "Fail to parse program_desc from binary string."); + for (auto &block_desc : *desc_.mutable_blocks()) { + blocks_.emplace_back(new BlockDesc(this, &block_desc)); + } +} + +const std::vector ProgramDesc::GetFeedTargetNames() { + BlockDesc *global_block = blocks_[0].get(); + std::vector feed_target_names; + for (auto *op : global_block->AllOps()) { + if (op->Type() == kFeedOpType) { + feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); + } + } + return feed_target_names; +} + +const std::vector ProgramDesc::GetFetchTargetNames() { + BlockDesc *global_block = blocks_[0].get(); + std::vector fetch_target_names; + for (auto *op : global_block->AllOps()) { + if (op->Type() == kFetchOpType) { + fetch_target_names.push_back(op->Input("X")[0]); + } + } + return fetch_target_names; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..937de6ba9270a275e5d4e020fe5f2e7f5ef63557 --- /dev/null +++ b/paddle/fluid/framework/program_desc.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class BlockDesc; + +class ProgramDesc { + public: + ProgramDesc(); + + explicit ProgramDesc(const proto::ProgramDesc &desc); + + ProgramDesc(const ProgramDesc &o); + + explicit ProgramDesc(const std::string &binary_str); + + BlockDesc *AppendBlock(const BlockDesc &parent); + + BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); } + + const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; } + + size_t Size() const { return blocks_.size(); } + + proto::ProgramDesc *Proto(); + + const std::vector GetFeedTargetNames(); + const std::vector GetFetchTargetNames(); + + private: + proto::ProgramDesc desc_; + + std::vector> blocks_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..afd5c9dabfbb0dab2832300dedc378ef617d8e81 --- /dev/null +++ b/paddle/fluid/framework/program_desc_test.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/program_desc.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" + +namespace paddle { +namespace framework { +TEST(ProgramDesc, copy_ctor) { + ProgramDesc program; + auto* global_block = program.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); + op->SetOutput("Y", {out->Name()}); + + ProgramDesc program_copy(program); + + auto* global_block_copy = program_copy.MutableBlock(0); + ASSERT_NE(global_block, global_block_copy); + + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { + ASSERT_TRUE(global_block_copy->HasVar(name)); + auto* copy = global_block_copy->Var(name); + ASSERT_NE(copy, var_before); + ASSERT_EQ(copy->Name(), var_before->Name()); + ASSERT_EQ(copy->GetType(), var_before->GetType()); + ASSERT_EQ(copy->GetShape(), var_before->GetShape()); + ASSERT_EQ(copy->Proto()->SerializeAsString(), + var_before->Proto()->SerializeAsString()); 
+ }; + + ASSERT_EQ(global_block->LocalVarNames(), global_block_copy->LocalVarNames()); + ASSERT_EQ(3UL, global_block_copy->LocalVarNames().size()); + assert_same_var("X", x); + assert_same_var("Y", y); + assert_same_var("Out", out); + + for (size_t i = 0; i < global_block->OpSize(); ++i) { + auto op_origin = global_block->Op(i); + auto op_copy = global_block->Op(i); + + ASSERT_EQ(op_origin->Type(), op_copy->Type()); + ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); + ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); + + ASSERT_EQ(op_copy->Proto()->SerializeAsString(), + op_origin->Proto()->SerializeAsString()); + } + + // Not check block's protostr are same it because the order of vars could be + // different and it is correct. +} + +TEST(ProgramDescBind, serialize_and_deserialize) { + ProgramDesc program_origin; + auto* global_block = program_origin.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); + op->SetOutput("Y", {out->Name()}); + + std::string binary_str; + program_origin.Proto()->SerializeToString(&binary_str); + + ProgramDesc program_restored(binary_str); + auto* global_block_restored = program_restored.MutableBlock(0); + ASSERT_NE(global_block, global_block_restored); + + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { + ASSERT_TRUE(global_block_restored->HasVar(name)); + auto* restored = global_block_restored->Var(name); + ASSERT_NE(restored, var_before); + ASSERT_EQ(restored->Name(), var_before->Name()); + 
ASSERT_EQ(restored->GetType(), var_before->GetType()); + ASSERT_EQ(restored->GetShape(), var_before->GetShape()); + ASSERT_EQ(restored->Proto()->SerializeAsString(), + var_before->Proto()->SerializeAsString()); + }; + + ASSERT_EQ(global_block->LocalVarNames(), + global_block_restored->LocalVarNames()); + ASSERT_EQ(3UL, global_block_restored->LocalVarNames().size()); + assert_same_var("X", x); + assert_same_var("Y", y); + assert_same_var("Out", out); + + for (size_t i = 0; i < global_block->OpSize(); ++i) { + auto op_origin = global_block->Op(i); + auto op_restored = global_block->Op(i); + + ASSERT_EQ(op_origin->Type(), op_restored->Type()); + ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs()); + ASSERT_EQ(op_origin->Outputs(), op_restored->Outputs()); + + ASSERT_EQ(op_restored->Proto()->SerializeAsString(), + op_origin->Proto()->SerializeAsString()); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/proto_desc.h b/paddle/fluid/framework/proto_desc.h similarity index 100% rename from paddle/framework/proto_desc.h rename to paddle/fluid/framework/proto_desc.h diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc new file mode 100644 index 0000000000000000000000000000000000000000..79dbd3bcab4124d3aa765c8ede174c9fb3de689b --- /dev/null +++ b/paddle/fluid/framework/prune.cc @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/prune.h" + +#include +#include +#include +#include +#include + +#include + +namespace paddle { +namespace framework { + +const std::string kFeedOpType = "feed"; +const std::string kFetchOpType = "fetch"; +const std::string kDropOutOpType = "dropout"; +const std::string kBatchNormOpType = "batch_norm"; + +bool HasDependentVar(const proto::OpDesc& op_desc, + const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} + +bool IsTarget(const proto::OpDesc& op_desc) { + if (op_desc.has_is_target()) { + return op_desc.is_target(); + } + return false; +} + +int GetSubBlockIndex(const proto::OpDesc& op_desc) { + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + PADDLE_ENFORCE(attr.has_block_idx()); + return attr.block_idx(); + } + } + return -1; +} + +bool HasSubBlock(const proto::OpDesc& op_desc) { + return GetSubBlockIndex(op_desc) > 0; +} + +// block_id is the idx of the current block in the input desc +// parent_block_id is the idx of the parent of the current block +// in the output desc, -1 means the current block is global block +// dependent_vars is passed recursively from the parent block to +// the child block to help pruning +void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, + int block_id, int parent_block_id, + std::set& dependent_vars) { + auto& block = input.blocks(block_id); + auto& ops = block.ops(); + + bool expect_feed = true; + for (auto& op_desc : ops) { + PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed, + "All FeedOps are at the beginning of the ProgramDesc"); + expect_feed = (op_desc.type() == kFeedOpType); + } + + bool expect_fetch = true; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + PADDLE_ENFORCE(op_desc.type() != kFetchOpType || 
expect_fetch, + "All FetchOps must at the end of the ProgramDesc"); + expect_fetch = (op_desc.type() == kFetchOpType); + } + + std::vector should_run; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) { + // insert its input to the dependency graph + for (auto& var : op_desc.inputs()) { + for (auto& argu : var.arguments()) { + dependent_vars.insert(argu); + } + } + should_run.push_back(true); + } else { + should_run.push_back(false); + } + } + + // since we are traversing the ProgramDesc in reverse order + // we reverse the should_run vector + std::reverse(should_run.begin(), should_run.end()); + + // copy the current block from input to output + auto* block_field = output->mutable_blocks(); + *block_field->Add() = input.blocks(block_id); + + int output_block_id = output->blocks_size() - 1; + auto* output_block = output->mutable_blocks(output_block_id); + output_block->set_idx(output_block_id); + output_block->set_parent_idx(parent_block_id); + + auto* op_field = output_block->mutable_ops(); + op_field->Clear(); + for (size_t i = 0; i < should_run.size(); ++i) { + if (should_run[i]) { + auto* op = op_field->Add(); + *op = input.blocks(block_id).ops(i); + if (HasSubBlock(*op)) { + // create sub_block_dependent_vars here to help prune the sub block + std::set sub_block_dependent_vars; + for (auto& var : op->inputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + for (auto& var : op->outputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc + // output_block_id is the idx of the current block in the output desc + prune_impl(input, output, GetSubBlockIndex(*op), output_block_id, + sub_block_dependent_vars); + } + } + } + + // remove the VarDescs in BlockDesc that are not referenced in + // the 
pruned OpDescs + std::unordered_map var_map; + auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars(); + for (const auto& var : *var_field) { + var_map[var.name()] = var; + } + + std::set var_names; + for (const auto& op : *op_field) { + auto& input_field = op.inputs(); + for (auto& input_var : input_field) { + for (auto& arg : input_var.arguments()) { + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } + } + } + auto& output_field = op.outputs(); + for (auto& output_var : output_field) { + for (auto& arg : output_var.arguments()) { + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } + } + } + } + + var_field->Clear(); + for (const auto& name : var_names) { + *var_field->Add() = var_map[name]; + } +} + +// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { + std::set dependent_vars; + output->clear_blocks(); + prune_impl(input, output, 0, -1, dependent_vars); +} + +void inference_optimize_impl(const proto::ProgramDesc& input, + proto::ProgramDesc* output, int block_id) { + *output = input; + auto* op_field = output->mutable_blocks(block_id)->mutable_ops(); + for (auto& op_desc : *op_field) { + if (op_desc.type() == kDropOutOpType || + op_desc.type() == kBatchNormOpType) { + for (auto& attr : *op_desc.mutable_attrs()) { + if (attr.name() == "is_test") { + attr.set_b(true); + break; + } + } + } + } +} + +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output) { + inference_optimize_impl(input, output, 0); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h new file mode 100644 index 0000000000000000000000000000000000000000..601e66b67a77b615e43fe74e72935b1622e59965 --- /dev/null +++ b/paddle/fluid/framework/prune.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output); + +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..36b76f0763ec2bab861adf86b60093c5e3c4b9e2 --- /dev/null +++ b/paddle/fluid/framework/prune_test.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/prune.h" + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/net_op.h" + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" + +#include + +namespace f = paddle::framework; +namespace ops = paddle::operators; + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + paddle::framework::BlockDesc *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(paddle::framework::proto::DataType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +TEST(Prune, one_operator) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc pruned; + + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0); + + pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true); + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1); +} + +TEST(Prune, forward) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, + block); + + 
f::proto::ProgramDesc *pdesc = program.Proto(); + + for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) { + f::proto::ProgramDesc pruned; + pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true); + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1); + } +} + +TEST(Prune, multi_input_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{}, + block); + AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, + f::AttributeMap{}, block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4); +} + +TEST(Prune, multi_output_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2); +} + +TEST(Prune, multi_target) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, 
f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); + pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3); +} diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ef0c4821110a259fd20469e736b93f44a80f90a --- /dev/null +++ b/paddle/fluid/framework/reader.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace framework { + +DDim ReaderBase::shape(size_t idx) const { + PADDLE_ENFORCE_LT( + idx, shapes_.size(), + "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx, + shapes_.size()); + return shapes_[idx]; +} + +void ShuffleReader::ReadNext(std::vector* out) { + if (iteration_pos_ >= buffer_.size()) { + // Reload buffer with new data + buffer_.clear(); + buffer_.reserve(buffer_size_); + for (int i = 0; i < buffer_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be + // optimize. 
+ std::random_shuffle(buffer_.begin(), buffer_.end()); + iteration_pos_ = 0; + } + out->clear(); + if (!buffer_.empty()) { + std::swap(*out, buffer_[iteration_pos_++]); + } + // if buffer_ is empty, the 'out' will return as an empty vector. +} + +void BatchReader::ReadNext(std::vector* out) { + buffer_.clear(); + buffer_.reserve(batch_size_); + for (int i = 0; i < batch_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // Concat instances + out->clear(); + if (buffer_.empty()) { + // if buffer_ is empty, the 'out' will return as an empty vector. + return; + } + int out_num = buffer_[0].size(); + out->reserve(out_num); + for (int j = 0; j < out_num; ++j) { + // Merge shape and check date type + std::type_index batch_type = buffer_[0][j].type(); + DDim batch_shape = buffer_[0][j].dims(); + for (size_t i = 1; i < buffer_.size(); ++i) { + std::type_index ins_type = buffer_[i][j].type(); + DDim ins_shape = buffer_[i][j].dims(); + PADDLE_ENFORCE_EQ(batch_type, ins_type); + PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), + slice_ddim(ins_shape, 1, ins_shape.size())); + PADDLE_ENFORCE_GT(ins_shape[0], 0); + batch_shape[0] += ins_shape[0]; + } + + LoDTensor out_tensor; + out_tensor.Resize(batch_shape); + out_tensor.mutable_data(platform::CPUPlace(), batch_type); + int64_t dst_offset = 0; + + // Merge lod and data + LoD batch_lod; + for (size_t i = 0; i < buffer_.size(); ++i) { + DDim ins_shape = buffer_[i][j].dims(); + LoD ins_lod = buffer_[i][j].lod(); + if (i == 0) { + batch_lod = ins_lod; + } else { + PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size()); + for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) { + auto& lod_level = batch_lod[level_idx]; + for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) { + lod_level.push_back(ins_lod[level_idx][k] + lod_level.back()); + } + } + } + Tensor dst = out_tensor.Slice(dst_offset, dst_offset + 
ins_shape[0]); + Copy(buffer_[i][j], platform::CPUPlace(), &dst); + dst_offset += ins_shape[0]; + } + out_tensor.set_lod(batch_lod); + out->push_back(out_tensor); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h new file mode 100644 index 0000000000000000000000000000000000000000..4a5eba5fb733b3e9da2b245b4dda18725c9b0895 --- /dev/null +++ b/paddle/fluid/framework/reader.h @@ -0,0 +1,161 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor_array.h" + +namespace paddle { +namespace framework { + +class ReaderBase { + public: + explicit ReaderBase(const std::vector& shapes) : shapes_(shapes) { + PADDLE_ENFORCE(!shapes_.empty()); + } + virtual void ReadNext(std::vector* out) = 0; + virtual bool HasNext() const = 0; + + virtual void ReInit() = 0; + + DDim shape(size_t idx) const; + std::vector shapes() const { return shapes_; } + void set_shapes(const std::vector& shapes) { shapes_ = shapes; } + + virtual ~ReaderBase() {} + + protected: + std::vector shapes_; +}; + +class FileReader : public ReaderBase { + public: + explicit FileReader(const std::vector& shapes) : ReaderBase(shapes) {} +}; + +class DecoratedReader : public ReaderBase { + public: + explicit DecoratedReader(ReaderBase* reader) + : ReaderBase(reader->shapes()), reader_(reader) { + PADDLE_ENFORCE_NOT_NULL(reader_); + } + + bool HasNext() const override { return reader_->HasNext(); } + + void ReInit() override { reader_->ReInit(); } + + protected: + ReaderBase* reader_; +}; + +// file readers + +template +class RandomDataGenerator : public FileReader { + public: + RandomDataGenerator(const std::vector& shapes, float min, float max) + : FileReader(shapes), min_(min), max_(max) { + PADDLE_ENFORCE_LE( + min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); + unsigned int seed = std::random_device()(); + engine_.seed(seed); + dist_ = std::uniform_real_distribution(min_, max_); + } + + void ReadNext(std::vector* out) override { + out->clear(); + out->reserve(shapes_.size()); + for (const DDim& shape : shapes_) { + PADDLE_ENFORCE_GE( + shape.size(), 2, + "The rank of reader's output data should be 2 at least.(Now it's %d)", + shape.size()); + LoDTensor out_tensor; + out_tensor.Resize(shape); + T* data = out_tensor.mutable_data(platform::CPUPlace()); + int64_t numel = product(shape); + for (int64_t i = 0; i < numel; ++i) { + 
data[i] = dist_(engine_); + } + out->push_back(out_tensor); + } + } + + bool HasNext() const override { return true; } + + void ReInit() override { return; } + + private: + float min_; + float max_; + std::minstd_rand engine_; + std::uniform_real_distribution dist_; +}; + +// decorated readers + +class ShuffleReader : public DecoratedReader { + public: + ShuffleReader(ReaderBase* reader, int buffer_size) + : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) { + buffer_.reserve(buffer_size); + } + + void ReadNext(std::vector* out) override; + + private: + int buffer_size_; + std::vector> buffer_; + size_t iteration_pos_; +}; + +class BatchReader : public DecoratedReader { + public: + BatchReader(ReaderBase* reader, int batch_size) + : DecoratedReader(reader), batch_size_(batch_size) { + buffer_.reserve(batch_size_); + } + + void ReadNext(std::vector* out) override; + + private: + int batch_size_; + std::vector> buffer_; +}; + +// The ReaderHolder is used as readers' unified wrapper, +// making it easier to access different type readers in Variables. +class ReaderHolder { + public: + void Reset(ReaderBase* reader) { reader_.reset(reader); } + + ReaderBase* Get() const { return reader_.get(); } + + void ReadNext(std::vector* out) { reader_->ReadNext(out); } + bool HasNext() const { return reader_->HasNext(); } + void ReInit() { reader_->ReInit(); } + + DDim shape(size_t idx) const { return reader_->shape(idx); } + std::vector shapes() const { return reader_->shapes(); } + void set_shapes(const std::vector& shapes) { + reader_->set_shapes(shapes); + } + + private: + std::unique_ptr reader_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc new file mode 100644 index 0000000000000000000000000000000000000000..6006ed16bd4a9aece5772bad58dc75c8b0847206 --- /dev/null +++ b/paddle/fluid/framework/scope.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/scope.h" + +#include // for unique_ptr +#include // for call_once +#include "glog/logging.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/string/printf.h" + +DEFINE_bool(benchmark, false, + "Doing memory benchmark. It will make deleting scope synchronized, " + "and add some memory usage logs." + "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); + +namespace paddle { +namespace framework { + +Scope::~Scope() { + DropKids(); + for (auto& kv : vars_) { + VLOG(3) << "Destroy variable " << kv.first; + delete kv.second; + } +} + +Scope& Scope::NewScope() const { + kids_.push_back(new Scope(this)); + return *kids_.back(); +} + +Variable* Scope::Var(const std::string& name) { + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + v = new Variable(); + vars_[name] = v; + VLOG(3) << "Create variable " << name; + v->name_ = &(vars_.find(name)->first); + return v; +} + +Variable* Scope::Var(std::string* name) { + auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + if (name != nullptr) { + *name = var_name; + } + return Var(var_name); +} + +Variable* Scope::FindVar(const std::string& name) const { + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } + return (parent_ == nullptr) ? 
nullptr : parent_->FindVar(name); +} + +const Scope* Scope::FindScope(const Variable* var) const { + for (auto& kv : vars_) { + if (kv.second == var) { + return this; + } + } + return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); +} +void Scope::DropKids() { + for (Scope* s : kids_) delete s; + kids_.clear(); +} + +std::vector Scope::LocalVarNames() const { + std::vector known_vars; + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } + return known_vars; +} + +void Scope::DeleteScope(Scope* scope) { + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); + this->kids_.erase(it); + // When making memory benchmark on Fluid, we have to delete scope sync. + if (FLAGS_benchmark) { + delete scope; + } else { + Async([scope] { delete scope; }); + } +} + +void Scope::Rename(const std::string& origin_name, + const std::string& new_name) const { + auto origin_it = vars_.find(origin_name); + PADDLE_ENFORCE(origin_it != vars_.end(), + "Cannot find original variable with name %s", origin_name); + auto new_it = vars_.find(new_name); + PADDLE_ENFORCE(new_it == vars_.end(), + "The variable with name %s is already in the scope", new_name); + vars_[new_name] = origin_it->second; + vars_.erase(origin_it); +} + +std::string Scope::Rename(const std::string& origin_name) const { + auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + Rename(origin_name, var_name); + return var_name; +} + +Variable* Scope::FindVarLocally(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) return it->second; + return nullptr; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h new file mode 100644 index 0000000000000000000000000000000000000000..2da9e0716e7c02dc8c5397e37746344dff8e429d --- /dev/null +++ 
b/paddle/fluid/framework/scope.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class Scope; + +/** + * @brief Scope that manage all variables. + * + * Scope is an association of a name to Variable. All variables belong to + * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. + * One net can run in different scopes and update different variable in the + * scope. + */ +class Scope { + public: + Scope() {} + ~Scope(); + + /// Create a sub-scope. Returns a reference other than a pointer so + /// to prevent from manual deletion. + /// Mark it to const because that new kid scope cannot change parent scope. + Scope& NewScope() const; + + /// Create a variable with given name if it doesn't exist. + Variable* Var(const std::string& name); + + /// Create a variable with a scope-unique name. + Variable* Var(std::string* name = nullptr); + + /// Find a variable in the scope or any of its ancestors. Returns + /// nullptr if cannot find. + Variable* FindVar(const std::string& name) const; + + const Scope& parent() const { return *parent_; } + + /// Find the scope or an ancestor scope that contains the given variable. 
+ const Scope* FindScope(const Variable* var) const; + + void DeleteScope(Scope* scope); + + /// Drop all kids scopes belonged to this scope. + void DropKids(); + + // enumerate all the variables current contains. + std::vector LocalVarNames() const; + + // Rename variable to a new name + void Rename(const std::string& origin_name, + const std::string& new_name) const; + + // Rename variable to a new name and return the new name + std::string Rename(const std::string& origin_name) const; + + Variable* FindVarLocally(const std::string& name) const; + + private: + // Call Scope::NewScope for a sub-scope. + explicit Scope(Scope const* parent) : parent_(parent) {} + + mutable std::unordered_map vars_; + mutable std::list kids_; + Scope const* parent_{nullptr}; + + DISABLE_COPY_AND_ASSIGN(Scope); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_test.cc b/paddle/fluid/framework/scope_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d64acb130cb29eda34cb01ef0533c42f1f03dcf8 --- /dev/null +++ b/paddle/fluid/framework/scope_test.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/scope.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +using paddle::framework::Scope; +using paddle::framework::Variable; + +TEST(Scope, VarsShadowing) { + Scope s; + Scope& ss1 = s.NewScope(); + Scope& ss2 = s.NewScope(); + + Variable* v0 = s.Var("a"); + Variable* v1 = ss1.Var("a"); + + EXPECT_NE(v0, v1); + + EXPECT_EQ(v0, s.FindVar("a")); + EXPECT_EQ(v1, ss1.FindVar("a")); + EXPECT_EQ(v0, ss2.FindVar("a")); +} + +TEST(Scope, FindVar) { + Scope s; + Scope& ss = s.NewScope(); + + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_EQ(nullptr, ss.FindVar("a")); + + ss.Var("a"); + + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_NE(nullptr, ss.FindVar("a")); +} + +TEST(Scope, FindScope) { + Scope s; + Scope& ss = s.NewScope(); + Variable* v = s.Var("a"); + + EXPECT_EQ(&s, s.FindScope(v)); + EXPECT_EQ(&s, ss.FindScope(v)); +} + +TEST(Scope, GetAllNames) { + Scope s; + Variable* v = s.Var("a"); + EXPECT_EQ(&s, s.FindScope(v)); + + std::vector ans = s.LocalVarNames(); + std::string str; + for (auto& var : ans) { + str += var; + } + + EXPECT_STREQ("a", str.c_str()); +} diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5d9e9a4951877e031ea6fdf529676fcb21e202f --- /dev/null +++ b/paddle/fluid/framework/selected_rows.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2st field, rows information + auto& rows = selected_rows.rows(); + uint64_t size = rows.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + for (uint64_t i = 0; i < size; ++i) { + os.write(reinterpret_cast(&rows[i]), sizeof(rows[i])); + } + } + { + // the 3st field, the height of SelectedRows + int64_t height = selected_rows.height(); + os.write(reinterpret_cast(&height), sizeof(height)); + } + // the 4st field, Tensor data + SerializeToStream(os, selected_rows.value(), dev_ctx); +} + +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx) { + { + // the 1st field, unit32_t version for SelectedRows + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2st field, rows information + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + auto& rows = *selected_rows->mutable_rows(); + rows.resize(size); + for (uint64_t i = 0; i < size; ++i) { + is.read(reinterpret_cast(&rows[i]), sizeof(int64_t)); + } + } + { + // the 3st field, the height of the SelectedRows + int64_t height; + is.read(reinterpret_cast(&height), sizeof(int64_t)); + selected_rows->set_height(height); + } + // the 4st field, tensor which contains the data + DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h new file mode 100644 index 
0000000000000000000000000000000000000000..f1a263962b2efc1ca828dd2eeb45495334ac1047 --- /dev/null +++ b/paddle/fluid/framework/selected_rows.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class SelectedRows { + public: + SelectedRows(const std::vector& rows, const int64_t& height) + : rows_(rows), height_(height) { + value_.reset(new Tensor()); + } + + SelectedRows() { + height_ = 0; + value_.reset(new Tensor()); + } + + platform::Place place() const { return value_->place(); } + + const Tensor& value() const { return *value_; } + + Tensor* mutable_value() { return value_.get(); } + + int64_t height() const { return height_; } + + void set_height(int64_t height) { height_ = height; } + + const Vector& rows() const { return rows_; } + + Vector* mutable_rows() { return &rows_; } + + void set_rows(const Vector& rows) { rows_ = rows; } + + DDim GetCompleteDims() const { + std::vector dims = vectorize(value_->dims()); + dims[0] = height_; + return make_ddim(dims); + } + + private: + // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. + // SelectedRows are simplely concated when adding together. Until a + // SelectedRows add a Tensor, will the duplicate rows be handled. 
+ Vector rows_; + std::unique_ptr value_{nullptr}; + int64_t height_; +}; + +/* + * Serialize/Desiralize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d414f2a5934282b4d586e6a9f7f81e44afbc9305 --- /dev/null +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/selected_rows.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { + +class SelectedRowsTester : public ::testing::Test { + public: + virtual void SetUp() override { + std::vector rows{0, 4, 7}; + int64_t height = 10; + int64_t row_numel = 100; + selected_rows_.reset(new SelectedRows(rows, height)); + + Tensor* value = selected_rows_->mutable_value(); + value->mutable_data( + make_ddim({static_cast(rows.size()), row_numel}), place_); + } + + protected: + platform::CPUPlace place_; + std::unique_ptr selected_rows_{nullptr}; +}; + +TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } + +TEST_F(SelectedRowsTester, dims) { + ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100})); +} + +TEST_F(SelectedRowsTester, complete_dims) { + ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100})); +} + +TEST_F(SelectedRowsTester, SerializeAndDeseralize) { + SelectedRows dst_tensor; + platform::CPUDeviceContext cpu_ctx(place_); + std::ostringstream oss; + + SerializeToStream(oss, *selected_rows_, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + + ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows()); + ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); + ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); + ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc new file mode 100644 index 0000000000000000000000000000000000000000..cfd2334f1af19023c607d364172a4176be10f622 --- /dev/null +++ b/paddle/fluid/framework/shape_inference.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/framework/shape_inference.h" +#include "grad_op_desc_maker.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +DDim InferShapeContext::GetInputDim(const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + return this->GetDim(arg_names[0]); +} + +std::vector InferShapeContext::GetInputsDim( + const std::string &name) const { + const std::vector &arg_names = Inputs(name); + return GetDims(arg_names); +} + +std::vector InferShapeContext::GetReaderDims( + const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader input '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->GetRepeatedDims(arg_names[0]); +} + +DDim InferShapeContext::GetInputsElementDim(const std::string &name, + int idx) const { + const std::vector &names = Inputs(name); + return this->GetDim(names[idx]); +} + +void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) { + auto &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + SetDim(arg_names[0], dim); +} + +void InferShapeContext::SetOutputsDim(const std::string 
&name, + const std::vector &dims) { + auto &names = Outputs(name); + SetDims(names, dims); +} + +void InferShapeContext::SetReaderDims(const std::string &name, + const std::vector &dims) { + const std::vector &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader output '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->SetRepeatedDims(arg_names[0], dims); +} + +std::vector InferShapeContext::GetInputVarPtrs( + const std::string &name) { + const std::vector arg_names = Inputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform( + arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { return this->GetVarPtr(name); }); + return res; +} + +std::vector InferShapeContext::GetOutputVarPtrs( + const std::string &name) { + const std::vector arg_names = Outputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform( + arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { return this->GetVarPtr(name); }); + return res; +} + +std::vector InferShapeContext::GetDims( + const std::vector &names) const { + std::vector ret; + ret.reserve(names.size()); + std::transform( + names.begin(), names.end(), std::back_inserter(ret), + [this](const std::string &name) { return this->GetDim(name); }); + return ret; +} + +void InferShapeContext::SetDims(const std::vector &names, + const std::vector &dims) { + size_t length = names.size(); + PADDLE_ENFORCE_EQ(length, dims.size()); + for (size_t i = 0; i < length; ++i) { + if (names[i] == framework::kEmptyVarName) { + continue; + } + SetDim(names[i], dims[i]); + } +} + +std::vector InferShapeContext::GetInputsVarType( + const std::string &name) const { + return GetVarTypes(Inputs(name)); +} + +std::vector InferShapeContext::GetOutputsVarType( + const std::string &name) const { + return GetVarTypes(Outputs(name)); +} + +std::vector 
InferShapeContext::GetVarTypes( + const std::vector &names) const { + std::vector retv; + retv.resize(names.size()); + std::transform(names.begin(), names.end(), retv.begin(), + std::bind(std::mem_fn(&InferShapeContext::GetVarType), this, + std::placeholders::_1)); + return retv; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..c907523325c8472f902517deebec9bc02168713c --- /dev/null +++ b/paddle/fluid/framework/shape_inference.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +using InferShapeVarPtr = boost::variant; + +class InferShapeContext { + public: + virtual ~InferShapeContext() = default; + virtual bool HasInput(const std::string &name) const = 0; + virtual bool HasOutput(const std::string &name) const = 0; + + std::vector GetInputsVarType( + const std::string &name) const; + std::vector GetOutputsVarType( + const std::string &name) const; + + virtual bool HasInputs(const std::string &name) const = 0; + virtual bool HasOutputs(const std::string &name) const = 0; + + DDim GetInputDim(const std::string &name) const; + std::vector GetInputsDim(const std::string &name) const; + std::vector GetReaderDims(const std::string &name) const; + DDim GetInputsElementDim(const std::string &name, int idx) const; + + void SetOutputDim(const std::string &name, const DDim &dim); + void SetOutputsDim(const std::string &name, const std::vector &dims); + void SetReaderDims(const std::string &name, const std::vector &dims); + + virtual AttrReader Attrs() const = 0; + virtual const std::vector &Inputs( + const std::string &name) const = 0; + virtual const std::vector &Outputs( + const std::string &name) const = 0; + + virtual void ShareLoD(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) const = 0; + + virtual bool IsRuntime() const = 0; + + std::vector GetInputVarPtrs(const std::string &name); + std::vector GetOutputVarPtrs(const std::string &name); + + // Note: In while op, we need this to be public + void SetDims(const std::vector &names, + const std::vector &dims); + + protected: + virtual DDim GetDim(const std::string &name) const = 0; + virtual void SetDim(const std::string &name, const DDim &dim) = 0; + virtual std::vector 
GetRepeatedDims(const std::string &name) const = 0; + virtual void SetRepeatedDims(const std::string &name, + const std::vector &dims) = 0; + + std::vector GetDims(const std::vector &names) const; + + std::vector GetVarTypes( + const std::vector &names) const; + + virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0; + + virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..a56091d3c629c4cedc13c465c84a646dc02cd094 --- /dev/null +++ b/paddle/fluid/framework/tensor.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework {} +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..44d2c7dae943a06eeab8ab1a1565f62b11de0af1 --- /dev/null +++ b/paddle/fluid/framework/tensor.h @@ -0,0 +1,227 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace framework { + +class LoDTensor; + +class Tensor { + public: + template + friend struct EigenTensor; + + template + friend struct EigenMatrix; + + template + friend struct EigenVector; + + public: + Tensor() : offset_(0) {} + + /*! Constructor with place should only be used in pybind. */ + explicit Tensor(const platform::Place& place) : offset_(0) { + holder_->set_place(place); + } + + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /*! Return a pointer to constant memory block. */ + template + inline const T* data() const; + + inline bool IsInitialized() const; + + inline void switch_place(platform::Place new_place); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + inline void* mutable_data(platform::Place place, std::type_index type); + + inline void* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. 
+ */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + /*! Return the numel of the memory block. */ + inline int64_t numel() const; + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! The internal of two tensors share the same memory block. */ + inline Tensor& ShareDataWith(const Tensor& src); + + /** + * @brief Return a sub-tensor of the given tensor. + * + * @param[in] begin_idx The index of the start row(inclusive) to slice. + * The index number begins from 0. + * @param[in] end_idx The index of the end row(exclusive) to slice. + * The index number begins from 0. + */ + inline Tensor Slice(int begin_idx, int end_idx) const; + + platform::Place place() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::place() is called."); + return holder_->place(); + } + + std::type_index type() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::type() is called."); + return holder_->type(); + } + + // memory size returns the holding memory size in byte. + size_t memory_size() const; + + inline void check_memory_size() const; + + inline DataLayout layout() const { return layout_; } + + inline void set_layout(const DataLayout layout) { layout_ = layout; } + + private: + friend class LoDTensor; + + /** + * @note Placeholder hides type T, so it doesn't appear as a template + * parameter of Variable. 
+ */ + struct Placeholder { + virtual ~Placeholder() = default; + virtual void* ptr() const = 0; + virtual size_t size() const = 0; + virtual std::type_index type() const = 0; + virtual platform::Place place() const = 0; + virtual void set_type(std::type_index type) = 0; + virtual void set_place(platform::Place place) = 0; + }; + + template + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(Place place, size_t size, std::type_index type) + : ptr_(static_cast(memory::Alloc(place, size)), + memory::PODDeleter(place)), + place_(place), + size_(size), + type_(type) { + PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", + (is_cpu_place(place_) ? "CPU" : "GPU")); + } + + virtual size_t size() const { return size_; } + virtual platform::Place place() const { return place_; } + virtual void* ptr() const { return static_cast(ptr_.get()); } + virtual std::type_index type() const { return type_; } + virtual void set_type(std::type_index type) { type_ = type; } + virtual void set_place(platform::Place place) { place_ = place; } + + /*! the pointer of memory block. */ + std::unique_ptr> ptr_; + + /*! the place of memory block. */ + platform::Place place_; + + /*! the size of memory block. */ + size_t size_; + + /* the current type of memory */ + std::type_index type_; + }; + + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /** + * @brief points to elements dimensions. + * + * @note dims_ do not indicate the memory block size. + */ + + DDim dims_; + + /** + * @brief the layout of memory block, default is NHWC. + * + * @note the memory allocation order, describe how weight/data is stored + * For example, in 4-D Tensor(rank=4), there are three commonly + * used layout. They are + * NCHW, NHWC, CHWN. + * N,C,H,W for respectively the batch size, the number of + * feature maps, the height. + */ + + DataLayout layout_ = DataLayout::kNHWC; + + /** + * @brief A PlaceHolder may be shared by more than one tensor. 
+ * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ + size_t offset_; +}; + +inline void Tensor::switch_place(platform::Place new_place) { + if (holder_->place() == new_place) { + return; + } + + // TODO(tonyyang-svail): do memcpy here. + PADDLE_THROW("Not Implemented"); +} + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/tensor_impl.h" diff --git a/paddle/framework/tensor.md b/paddle/fluid/framework/tensor.md similarity index 100% rename from paddle/framework/tensor.md rename to paddle/fluid/framework/tensor.md diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..e69836292cd0f4ed99e87ee8e297021dac43b64f --- /dev/null +++ b/paddle/fluid/framework/tensor_impl.h @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +template +struct SizeOfTypeFunctor; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + if (typeid(T).hash_code() == type.hash_code()) { + return sizeof(T); + } else { + return 0UL; + } + } +}; + +template <> +struct SizeOfTypeFunctor<> { + size_t operator()(std::type_index type) const { return 0UL; } +}; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + SizeOfTypeFunctor head; + size_t head_size = head(type); + if (head_size != 0) { + return head_size; + } + SizeOfTypeFunctor tail; + return tail(type); + } +}; + +static inline size_t SizeOfType(std::type_index type) { + SizeOfTypeFunctor functor; + size_t size = functor(type); + PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); + return size; +} + +inline void Tensor::check_memory_size() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()), memory_size(), + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory.\n" + "or maybe the required data-type mismatches the data already stored."); +} + +inline size_t Tensor::memory_size() const { + return holder_ == nullptr ? 
0UL : holder_->size() - offset_; +} + +template +inline const T* Tensor::data() const { + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } + +template +inline T* Tensor::data() { + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + Resize(dims); + return mutable_data(place); +} + +template +inline T* Tensor::mutable_data(platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data(place, typeid(T))); +} + +inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_ENFORCE_GT( + numel(), 0, + "When calling this method, the Tensor's numel must be larger than zero. 
" + "Please check Tensor::Resize has been called first."); + int64_t size = numel() * SizeOfType(type); + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_gpu_place(place)) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + } +#else + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +inline void* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing"); + return mutable_data(place, holder_->type()); +} + +inline Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; + return *this; +} + +inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, 0, + "The start row index must be greater than 0."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_LT( + begin_idx, end_idx, + "The start row index must be lesser than the end row index."); + + if (dims_[0] == 1) { + return *this; + } else { + size_t base = numel() / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + dst.set_layout(layout_); + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + return dst; + } +} + +inline Tensor& Tensor::Resize(const DDim& dims) { + dims_ = dims; + return *this; +} + +inline const DDim& Tensor::dims() const { return dims_; } + +inline int64_t Tensor::numel() const { return product(dims_); } + +inline Tensor ReshapeToMatrix(const Tensor& src, int 
num_col_dims) { + Tensor res; + res.ShareDataWith(src); + res.Resize(flatten_to_2d(src.dims(), num_col_dims)); + return res; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ed416e46f99f4d2ed50538a3e2c090ed8dd6fc3 --- /dev/null +++ b/paddle/fluid/framework/tensor_test.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/tensor.h" +#include +#include + +namespace framework = paddle::framework; +namespace platform = paddle::platform; + +TEST(Tensor, Dims) { + framework::Tensor tt; + tt.Resize({2, 3, 4}); + framework::DDim dims = tt.dims(); + ASSERT_EQ(arity(dims), 3); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(i + 2, dims[i]); + } +} + +TEST(Tensor, DataAssert) { + framework::Tensor src_tensor; + + bool caught = false; + try { + src_tensor.data(); + } catch (platform::EnforceNotMet err) { + caught = true; + std::string msg = + "holder_ should not be null\nTensor holds no memory. 
Call " + "Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); +} + +TEST(Tensor, MutableData) { + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CPUPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // memory is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // memory block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CPUPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // memory block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + EXPECT_EQ(p1, p2); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CUDAPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // memory is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CUDAPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // memory block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CUDAPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // memory block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CUDAPlace()); + EXPECT_EQ(p1, p2); + } +#endif +} + +TEST(Tensor, ShareDataWith) { + { + 
framework::Tensor src_tensor; + framework::Tensor dst_tensor; + // Try to share data from an uninitialized tensor + bool caught = false; + try { + dst_tensor.ShareDataWith(src_tensor); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = + "holder_ should not be null\nTensor holds no memory. Call " + "Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CPUPlace()); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CUDAPlace()); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +} + +TEST(Tensor, Slice) { + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({5, 3, 4}), + platform::CPUPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(1, 3); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 3); + EXPECT_EQ(slice_dims[0], 2); + EXPECT_EQ(slice_dims[1], 3); + EXPECT_EQ(slice_dims[2], 4); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); + } + +#ifdef PADDLE_WITH_CUDA + { + 
framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::CUDAPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +#endif +} + +TEST(Tensor, ReshapeToMatrix) { + framework::Tensor src; + int* src_ptr = src.mutable_data({2, 3, 4, 9}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { + src_ptr[i] = i; + } + framework::Tensor res = framework::ReshapeToMatrix(src, 2); + ASSERT_EQ(res.dims()[0], 2 * 3); + ASSERT_EQ(res.dims()[1], 4 * 9); +} + +TEST(Tensor, Layout) { + framework::Tensor src; + ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + src.set_layout(framework::DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); +} diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..537fb4614cac8bc898b277899f803a3b1846a00e --- /dev/null +++ b/paddle/fluid/framework/tensor_util.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace framework { +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void operator()() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = 
platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + Copy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct HasNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is nan. + return eigen_vec.isnan(); + } +}; + +bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; + return Any(tensor, predicate); +} + +struct HasInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu new file mode 100644 index 0000000000000000000000000000000000000000..537fb4614cac8bc898b277899f803a3b1846a00e --- /dev/null +++ b/paddle/fluid/framework/tensor_util.cu @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace framework { +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void operator()() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + Copy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const 
framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct HasNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is nan. + return eigen_vec.isnan(); + } +}; + +bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; + return Any(tensor, predicate); +} + +struct HasInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h new file mode 100644 index 0000000000000000000000000000000000000000..b7e772b6daad93dc915665d58d4a5722c74c0d2b --- /dev/null +++ b/paddle/fluid/framework/tensor_util.h @@ -0,0 +1,333 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +/** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * @param[in] ctx The device context contains device resources. + * + * @note Copy supports CPU -> CPU, CPU <-> GPU, GPU <-> GPU (GPU paths need PADDLE_WITH_CUDA). + */ +inline void Copy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to " + << dst_place; + src.check_memory_size(); + + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + 
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief Wrapper on + * Copy(const Tensor& src, const platform::Place& dst_place, + * const platform::DeviceContext& ctx, Tensor* dst); + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * + * @note Copy supports CPU -> CPU, CPU <-> GPU, GPU <-> GPU. + */ +inline void Copy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + if (platform::is_gpu_place(src.place())) { + dev_ctx = pool.Get(src.place()); + } else { + dev_ctx = pool.Get(dst_place); + } + Copy(src, dst_place, *dev_ctx, dst); +} + +/** + * @brief Copy the content of an external vector to a tensor. + * + * @param[in] src The external vector. + * @param[in] ctx The device context contains device resources. + * + * @note CopyFromVector will resize dst to a 1-D tensor with the same + * size as src. 
+ */ +template +inline void CopyFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, src_place, + src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + boost::get(dst_place), dst_ptr, src_place, src_ptr, + size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief CopyFromVector CPU vector -> CPU Tensor + */ +template +inline void CopyFromVector(const std::vector& src, Tensor* dst) { + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); +} + +/** + * @brief Copy the content of a tensor to a vector + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. + * + * @note CopyToVector assumes that the tensor has been resized + * before invoking. 
+ */ +template +inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + boost::get(src.place()), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief CopyToVector CPU Tensor -> CPU vector + */ +template +inline void CopyToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + PADDLE_ENFORCE(platform::is_cpu_place(src.place())); + + memory::Copy(dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size); +} + +// Returns true if a tensor contains NAN, i.e., Not A Number. +bool HasNAN(const framework::Tensor& tensor); + +// Returns true if a tensor contains Inf, i.e., Infinity. 
+bool HasInf(const framework::Tensor& tensor); + +inline void SerializeToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx) { + // TODO(typhoonzero): serialize to ostream + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + proto::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.memory_size(); + auto* data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +struct DeserializedDataFunctor { + DeserializedDataFunctor(void** buf, Tensor* tensor, + const platform::Place& place) + : buf_(buf), tensor_(tensor), place_(place) {} + + 
template + void operator()() { + *buf_ = tensor_->mutable_data(place_); + } + + void** buf_; + Tensor* tensor_; + platform::Place place_; +}; + +inline void DeserializeFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + proto::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + void* buf; + auto ctx = platform::CPUDeviceContext(); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + Tensor cpu_tensor; + cpu_tensor.Resize(framework::make_ddim(dims)); + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), cpu_tensor.memory_size()); + auto dst_place = dev_ctx.GetPlace(); + framework::Copy(cpu_tensor, dst_place, dev_ctx, tensor); +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + is.read(static_cast(buf), tensor->memory_size()); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8764c692e875328fc98a7b67a018014af487f394 --- /dev/null +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -0,0 +1,309 @@ +// Copyright (c) 2018 PaddlePaddle 
Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/tensor_util.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +TEST(Copy, Tensor) { + Tensor src_tensor; + Tensor dst_tensor; + platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + src_tensor.set_layout(DataLayout::kAnyLayout); + + auto cpu_place = new platform::CPUPlace(); + Copy(src_tensor, *cpu_place, &dst_tensor); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + + Tensor slice_tensor = src_tensor.Slice(1, 2); + Copy(slice_tensor, *cpu_place, &dst_tensor); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + +#ifdef PADDLE_WITH_CUDA + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU 
Tensor to GPU Tensor + auto gpu_place = new platform::CUDAPlace(0); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + auto cpu_place = new platform::CPUPlace(); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + } +#endif +} + +TEST(CopyFromVector, Tensor) { + using namespace paddle::framework; + using namespace paddle::platform; + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CopyFromVector(src_vec, &cpu_tensor); + + // Compare Tensors + const int* cpu_ptr = cpu_tensor.data(); + const int* src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, &cpu_tensor); + cpu_ptr = cpu_tensor.data(); + src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + delete cpu_place; + } + +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = 
{1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CPUDeviceContext cpu_ctx(*cpu_place); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + + // Copy to GPUTensor + gpu_tensor.Resize(make_ddim({3, 3})); + auto gpu_place = new paddle::platform::CUDAPlace(); + CUDADeviceContext gpu_ctx(*gpu_place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + // Copy from GPU to CPU tensor for comparison + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* src_ptr = src_vec.data(); + const int* cpu_ptr = cpu_tensor.data(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + gpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + src_ptr = src_vec.data(); + cpu_ptr = cpu_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + delete cpu_place; + delete gpu_place; + } +#endif +} + +TEST(CopyToVector, Tensor) { + using namespace paddle::framework; + using namespace paddle::platform; + { + Tensor src; + int* src_ptr = src.mutable_data({3, 3}, CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = i; + } + + CPUPlace place; + std::vector dst; + CopyToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], 
dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor gpu_tensor; + CUDAPlace place; + CUDADeviceContext gpu_ctx(place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + CopyToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + +TEST(HasNAN, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + float* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + + ASSERT_TRUE(HasNAN(src)); +} + +TEST(HasInf, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + double* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(HasInf(src)); +} + +TEST(Tensor, SerializeAndDeserialize) { + framework::Tensor src_tensor; + int array[6] = {1, 2, 3, 4, 5, 6}; + src_tensor.Resize({2, 3}); + int* src_ptr = src_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + src_ptr[i] = array[i]; + } + { + framework::Tensor dst_tensor; + auto place = new platform::CPUPlace(); + platform::CPUDeviceContext cpu_ctx(*place); + std::ostringstream oss; + SerializeToStream(oss, src_tensor, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 5; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + ASSERT_EQ(dst_tensor.dims(), src_tensor.dims()); + delete place; + } +#ifdef PADDLE_WITH_CUDA + { + Tensor gpu_tensor; + gpu_tensor.Resize({2, 3}); + Tensor dst_tensor; + + auto gpu_place = new platform::CUDAPlace(); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + std::ostringstream oss; + SerializeToStream(oss, gpu_tensor, gpu_ctx); + + std::istringstream iss(oss.str()); + 
DeserializeFromStream(iss, &dst_tensor, gpu_ctx); + + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + delete gpu_place; + } +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..1982b642bcd1f2e21a684b701d7bd603f0c2c894 --- /dev/null +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+// Test kernel: stores {0.0, 0.1, NAN} into buf[0..2].
+static __global__ void FillNAN(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = NAN;
+}
+// Test kernel: stores {0.0, INFINITY, 0.5} into buf[0..2].
+static __global__ void FillInf(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = INFINITY;
+  buf[2] = 0.5;
+}
+
+// Fills a 3-element tensor on CUDA device 0 with a NAN through FillNAN,
+// waits for the context, and expects HasNAN to report it.
+// NOTE(review): the template arguments of GetByPlace / mutable_data look
+// stripped in this copy of the file -- restore from the upstream source.
+TEST(HasNAN, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data({3}, gpu);
+  // One block with one thread, launched on the context's stream.
+  FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasNAN(tensor));
+}
+
+// Same as above for an INFINITY written by FillInf and checked by HasInf.
+TEST(HasInf, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data({3}, gpu);
+  FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasInf(tensor));
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c4de41b0c41fa3eeaf6c77def9c728dd9976895
--- /dev/null
+++ b/paddle/fluid/framework/threadpool.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
*/ + +#include "paddle/fluid/framework/threadpool.h" + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +std::unique_ptr ThreadPool::threadpool_(nullptr); +std::once_flag ThreadPool::init_flag_; + +ThreadPool* ThreadPool::GetInstance() { + std::call_once(init_flag_, &ThreadPool::Init); + return threadpool_.get(); +} + +void ThreadPool::Init() { + if (threadpool_.get() == nullptr) { + // TODO(Yancey1989): specify the max threads number + int num_threads = std::thread::hardware_concurrency(); + PADDLE_ENFORCE_GT(num_threads, 0); + threadpool_.reset(new ThreadPool(num_threads)); + } +} + +ThreadPool::ThreadPool(int num_threads) + : total_threads_(num_threads), idle_threads_(num_threads), running_(true) { + threads_.resize(num_threads); + for (auto& thread : threads_) { + // TODO(Yancey1989): binding the thread on the specify CPU number + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + } +} + +ThreadPool::~ThreadPool() { + { + // notify all threads to stop running + running_ = false; + scheduled_.notify_all(); + } + + for (auto& t : threads_) { + t->join(); + t.reset(nullptr); + } +} + +void ThreadPool::Wait() { + std::unique_lock lock(mutex_); + completed_.wait(lock, [=] { return Done() == true; }); +} + +void ThreadPool::TaskLoop() { + while (running_) { + std::unique_lock lock(mutex_); + scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); + + if (!running_) { + break; + } + // pop a task from the task queue + auto task = std::move(tasks_.front()); + tasks_.pop(); + + --idle_threads_; + lock.unlock(); + + // run the task + task(); + + { + std::unique_lock lock(mutex_); + ++idle_threads_; + if (Done()) { + completed_.notify_all(); + } + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h new file mode 100644 index 
0000000000000000000000000000000000000000..e88e6c01f02deb77278a02ba81ee62ddfcf42eb8 --- /dev/null +++ b/paddle/fluid/framework/threadpool.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { + +// ThreadPool maintains a queue of tasks, and runs them using a fixed +// number of threads. +class ThreadPool { + public: + using Task = std::packaged_task()>; + + // Returns the singleton of ThreadPool. + static ThreadPool* GetInstance(); + + ~ThreadPool(); + + // Returns the number of threads created by the constructor. + size_t Threads() const { return total_threads_; } + + // Returns the number of currently idle threads. + size_t IdleThreads() { + std::unique_lock lock(mutex_); + return idle_threads_; + } + + // Run pushes a function to the task queue and returns a std::future + // object. To wait for the completion of the task, call + // std::future::wait(). 
+ template + std::future Run(Callback fn) { + auto f = this->RunAndGetException(fn); + return std::async(std::launch::deferred, ExceptionHandler(std::move(f))); + } + + template + std::future> RunAndGetException( + Callback fn) { + std::unique_lock lock(mutex_); + Task task([fn]() -> std::unique_ptr { + try { + fn(); + return nullptr; + } catch (platform::EnforceNotMet ex) { + return std::unique_ptr( + new platform::EnforceNotMet(ex)); + } catch (...) { + LOG(FATAL) + << "Unexpected exception is catched in thread pool. All " + "throwable exception in Fluid should be an EnforceNotMet."; + } + }); + std::future> f = task.get_future(); + tasks_.push(std::move(task)); + lock.unlock(); + scheduled_.notify_one(); + return f; + } + + // Wait until all the tasks are completed. + void Wait(); + + private: + struct ExceptionHandler { + mutable std::future> future_; + explicit ExceptionHandler( + std::future>&& f) + : future_(std::move(f)) {} + void operator()() const { + auto ex = this->future_.get(); + if (ex != nullptr) { + LOG(FATAL) << "The exception is thrown inside the thread pool. You " + "should use RunAndGetException to handle the exception.\n" + "The default exception handler is LOG(FATAL)." + << ex->what(); + } + } + }; + + DISABLE_COPY_AND_ASSIGN(ThreadPool); + + explicit ThreadPool(int num_threads); + + // If the task queue is empty and avaialbe is equal to the number of + // threads, means that all tasks are completed. Note: this function + // is not thread-safe. Returns true if all tasks are completed. + // Note: don't delete the data member total_threads_ and use + // threads_.size() instead; because you'd need to lock the mutex + // before accessing threads_. + bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; } + + // The constructor starts threads to run TaskLoop, which retrieves + // and runs tasks from the queue. + void TaskLoop(); + + // Init is called by GetInstance. 
+ static void Init(); + + private: + static std::unique_ptr threadpool_; + static std::once_flag init_flag_; + + std::vector> threads_; + const size_t total_threads_; + size_t idle_threads_; + + std::queue tasks_; + std::mutex mutex_; + bool running_; + std::condition_variable scheduled_; + std::condition_variable completed_; +}; + +// Run a function asynchronously. +// NOTE: The function must return void. If the function need to return a value, +// you can use lambda to capture a value pointer. +template +std::future Async(Callback callback) { + return ThreadPool::GetInstance()->Run(callback); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc similarity index 100% rename from paddle/framework/threadpool_test.cc rename to paddle/fluid/framework/threadpool_test.cc diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..786d78a6440de60abea44f7b8fccb90d455b488c --- /dev/null +++ b/paddle/fluid/framework/type_defs.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+// NOTE(review): the targets of the bare #include lines and the template
+// arguments of the aliases below look stripped in this copy of the file --
+// restore them from the upstream source before relying on this text.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+// Forward declarations of the types the aliases below refer to.
+class OperatorBase;
+class OpDesc;
+class InferShapeContext;
+class BlockDesc;
+
+using VariableNameMap = std::map>;
+
+// The order should be as same as framework.proto
+using Attribute =
+    boost::variant,
+                   std::vector, std::vector, bool,
+                   std::vector, BlockDesc*, int64_t>;
+
+using AttributeMap = std::unordered_map;
+
+using OpCreator = std::function;
+
+// Callable taking an OpDesc, a no_grad_set, a grad_to_var map and a list of
+// grad blocks (see the parameter comments below).
+using GradOpMakerFN = std::function>(
+    const OpDesc&, const std::unordered_set& /*no_grad_set*/,
+    std::unordered_map* /*grad_to_var*/,
+    const std::vector& grad_block)>;
+
+using InferVarTypeFN =
+    std::function;
+
+using InferShapeFN = std::function;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7ec9b2ced94c4176b64996827dcb79f1d756be6b
--- /dev/null
+++ b/paddle/fluid/framework/var_desc.cc
@@ -0,0 +1,259 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); } + +void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); } + +void VarDesc::SetShape(const std::vector &dims) { + VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); +} + +void VarDesc::SetTensorDescNum(size_t num) { + switch (desc_.type()) { + case proto::VarDesc::READER: { + auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor(); + lod_tensors_ptr->Clear(); + for (size_t i = 0; i < num; ++i) { + lod_tensors_ptr->Add(); + } + return; + } break; + default: + PADDLE_THROW( + "Setting 'sub_tensor_number' is not supported by the type of var %s.", + this->Name()); + } +} + +size_t VarDesc::GetTensorDescNum() const { + switch (desc_.type()) { + case proto::VarDesc::READER: + return desc_.reader().lod_tensor_size(); + break; + default: + PADDLE_THROW( + "Getting 'sub_tensor_number' is not supported by the type of var %s.", + this->Name()); + } +} + +void VarDesc::SetShapes( + const std::vector> &multiple_dims) { + if (multiple_dims.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_dims.size()); + } + std::vector tensors = mutable_tensor_descs(); + for (size_t i = 0; i < multiple_dims.size(); ++i) { + VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims()); + } +} + +std::vector VarDesc::GetShape() const { + return RepeatedToVector(tensor_desc().dims()); +} + +std::vector> VarDesc::GetShapes() const { + std::vector descs = tensor_descs(); + std::vector> res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(RepeatedToVector(tensor_desc.dims())); + } + return res; +} + +void VarDesc::SetDataType(proto::DataType data_type) { + mutable_tensor_desc()->set_data_type(data_type); +} + +void VarDesc::SetDataTypes( + const std::vector &multiple_data_type) { + if (multiple_data_type.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_data_type.size()); + } + std::vector tensor_descs = mutable_tensor_descs(); + for (size_t i = 0; i < multiple_data_type.size(); ++i) { + tensor_descs[i]->set_data_type(multiple_data_type[i]); + } +} + +proto::DataType VarDesc::GetDataType() const { + return tensor_desc().data_type(); +} + +std::vector VarDesc::GetDataTypes() const { + std::vector descs = tensor_descs(); + std::vector res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(tensor_desc.data_type()); + } + return res; +} + +void VarDesc::SetLoDLevel(int32_t lod_level) { + switch (desc_.type()) { + case proto::VarDesc::LOD_TENSOR: + desc_.mutable_lod_tensor()->set_lod_level(lod_level); + break; + case proto::VarDesc::LOD_TENSOR_ARRAY: + desc_.mutable_tensor_array()->set_lod_level(lod_level); + break; + default: + PADDLE_THROW( + "Setting 'lod_level' is not supported by the type of var %s.", + this->Name()); + } +} + +void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { + if (multiple_lod_level.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_lod_level.size()); + } + switch (desc_.type()) { + case proto::VarDesc::READER: { + size_t i = 0; + for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) { + lod_tensor.set_lod_level(multiple_lod_level[i++]); + } + } break; + default: + PADDLE_THROW( + "Setting 'lod_levels' is not supported by the type of var %s.", + this->Name()); + } +} + +int32_t VarDesc::GetLoDLevel() const { + switch (desc_.type()) { + case proto::VarDesc::LOD_TENSOR: + return desc_.lod_tensor().lod_level(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.tensor_array().lod_level(); + default: + PADDLE_THROW( + "Getting 'lod_level' is not supported by the type of var %s.", + this->Name()); + } +} + +std::vector VarDesc::GetLoDLevels() const { + std::vector res; + switch (desc_.type()) { + case proto::VarDesc::READER: + res.reserve(desc_.reader().lod_tensor_size()); + for (auto &lod_tensor : desc_.reader().lod_tensor()) { + res.push_back(lod_tensor.lod_level()); + } + return res; + break; + default: + PADDLE_THROW( + "Getting 'lod_levels' is not supported by the type of var %s.", + this->Name()); + } +} + +const proto::TensorDesc &VarDesc::tensor_desc() const { + PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); + switch (desc_.type()) { + case proto::VarDesc::SELECTED_ROWS: + return desc_.selected_rows(); + case proto::VarDesc::LOD_TENSOR: + return desc_.lod_tensor().tensor(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.tensor_array().tensor(); + default: + PADDLE_THROW( + "Getting 'tensor_desc' is not supported by the type of var %s.", + this->Name()); + } +} + +std::vector VarDesc::tensor_descs() const { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_.type()) { + case proto::VarDesc::READER: + for (const auto &lod_tensor : desc_.reader().lod_tensor()) { + 
res.push_back(lod_tensor.tensor()); + } + return res; + default: + PADDLE_THROW( + "Getting 'tensor_descs' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +proto::TensorDesc *VarDesc::mutable_tensor_desc() { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + switch (desc_.type()) { + case proto::VarDesc::SELECTED_ROWS: + return desc_.mutable_selected_rows(); + case proto::VarDesc::LOD_TENSOR: + return desc_.mutable_lod_tensor()->mutable_tensor(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.mutable_tensor_array()->mutable_tensor(); + default: + PADDLE_THROW( + "Getting 'mutable_tensor_desc' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +std::vector VarDesc::mutable_tensor_descs() { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_.type()) { + case proto::VarDesc::READER: + for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) { + res.push_back(lod_tensor.mutable_tensor()); + } + return res; + default: + PADDLE_THROW( + "Getting 'tensor_descs' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..cdb1bc3ec09c890f2166190d591b6e6ee8b668a0 --- /dev/null +++ b/paddle/fluid/framework/var_desc.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// NOTE(review): the target of the bare #include and the template parameter
+// lists / arguments below look stripped in this copy of the file -- restore
+// them from the upstream source.
+#include
+#include "glog/logging.h"
+#include "paddle/fluid/framework/framework.pb.h"
+
+namespace paddle {
+namespace framework {
+
+// convert between std::vector and protobuf repeated.
+// Copies every element of `repeated_field` into a new vector.
+template
+inline std::vector RepeatedToVector(
+    const google::protobuf::RepeatedField &repeated_field) {
+  std::vector ret;
+  ret.reserve(repeated_field.size());
+  std::copy(repeated_field.begin(), repeated_field.end(),
+            std::back_inserter(ret));
+  return ret;
+}
+
+// Replaces the content of `repeated_field` with the elements of `vec`.
+template
+inline void VectorToRepeated(const std::vector &vec,
+                             RepeatedField *repeated_field) {
+  repeated_field->Clear();
+  repeated_field->Reserve(vec.size());
+  for (const auto &elem : vec) {
+    *repeated_field->Add() = elem;
+  }
+}
+
+// Specialize vector.
+// Same as above, but iterates by value instead of by const reference.
+template
+inline void VectorToRepeated(const std::vector &vec,
+                             RepeatedField *repeated_field) {
+  repeated_field->Clear();
+  repeated_field->Reserve(vec.size());
+  for (auto elem : vec) {
+    *repeated_field->Add() = elem;
+  }
+}
+
+// Wrapper around a proto::VarDesc message with typed accessors for its
+// name, type, persistable flag and tensor description(s).
+class VarDesc {
+ public:
+  // Creates a desc named `name`; the type defaults to LOD_TENSOR.
+  explicit VarDesc(const std::string &name) {
+    desc_.set_name(name);
+    desc_.set_type(proto::VarDesc::LOD_TENSOR);
+  }
+
+  // Wraps a copy of an existing proto message.
+  explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {}
+
+  // Returns a pointer to the wrapped proto message.
+  proto::VarDesc *Proto() { return &desc_; }
+
+  std::string Name() const { return desc_.name(); }
+
+  void SetName(std::string name) { desc_.set_name(name); }
+
+  void SetTensorDescNum(size_t num);
+
+  size_t GetTensorDescNum() const;
+
+  void SetShape(const std::vector &dims);
+
+  void SetShapes(const std::vector> &multiple_dims);
+
+  std::vector GetShape() const;
+
+  std::vector> GetShapes() const;
+
+  void SetDataType(proto::DataType data_type);
+
+  void SetDataTypes(const std::vector &multiple_data_type);
+
+  proto::DataType GetDataType() const;
+
+  std::vector GetDataTypes() const;
+
+  void SetLoDLevel(int32_t lod_level);
+
+  void SetLoDLevels(const std::vector &multiple_lod_level);
+
+  int32_t GetLoDLevel() const;
+
+  std::vector GetLoDLevels() const;
+
+  proto::VarDesc::VarType GetType() const;
+
+  void SetType(proto::VarDesc::VarType type);
+
+  bool Persistable() const { return desc_.persistable(); }
+
+  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
+
+ private:
+  // Accessors for the tensor desc(s) stored in desc_; the definitions are
+  // not in this header.
+  const proto::TensorDesc &tensor_desc() const;
+  std::vector tensor_descs() const;
+  proto::TensorDesc *mutable_tensor_desc();
+  std::vector mutable_tensor_descs();
+
+  proto::VarDesc desc_;
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..2dc4de529814bb0f7a5193b8e216343a4b1b3503
--- /dev/null
+++ b/paddle/fluid/framework/var_type.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016
PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { +inline proto::VarDesc::VarType ToVarType(std::type_index type) { + if (type.hash_code() == typeid(LoDTensor).hash_code()) { + return proto::VarDesc_VarType_LOD_TENSOR; + } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) { + return proto::VarDesc_VarType_LOD_RANK_TABLE; + } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) { + return proto::VarDesc_VarType_LOD_TENSOR_ARRAY; + } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { + return proto::VarDesc_VarType_SELECTED_ROWS; + } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) { + return proto::VarDesc_VarType_READER; + } else { + PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); + } +} + +template +inline void VisitVarType(const framework::Variable& var, Visitor visitor) { + switch (ToVarType(var.Type())) { + case proto::VarDesc_VarType_LOD_TENSOR: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_LOD_RANK_TABLE: + visitor(var.Get()); + return; + case 
proto::VarDesc_VarType_LOD_TENSOR_ARRAY: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_SELECTED_ROWS: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_READER: + visitor(var.Get()); + return; + default: + PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..44fd4cd622cbada7f10ade8928a69d4e3d1d9ec0 --- /dev/null +++ b/paddle/fluid/framework/var_type_inference.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/type_defs.h" + +namespace paddle { +namespace framework { + +class VarTypeInference { + public: + virtual ~VarTypeInference() {} + virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ee589c821a77af7f6714fefd7bebff89218dad8 --- /dev/null +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/var_type_inference.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SumOpVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc &op_desc, BlockDesc *block) const override { + auto &inputs = op_desc.Input("X"); + auto default_var_type = proto::VarDesc::SELECTED_ROWS; + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [block](const std::string &name) { + return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR; + }); + if (any_input_is_lod_tensor) { + default_var_type = proto::VarDesc::LOD_TENSOR; + } + + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(default_var_type); + } +}; +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP, + paddle::framework::SumOpMaker); + +namespace paddle { 
+namespace framework { + +TEST(InferVarType, sum_op) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"test_a", "test_b", "test_c"}); + op->SetOutput("Out", {"test_out"}); + + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarDesc::SELECTED_ROWS, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR); + op->InferVarType(prog.MutableBlock(0)); + ASSERT_EQ(proto::VarDesc::LOD_TENSOR, + prog.MutableBlock(0)->Var("test_out")->GetType()); +} + +TEST(InferVarType, sum_op_without_infer_var_type) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum_without_infer_var_type"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR, + prog.MutableBlock(0)->Var("test2_out")->GetType()); +} + +} // namespace framework +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h new file mode 100644 index 0000000000000000000000000000000000000000..9fb8ca92d68203a0bf8ec6ecd30072374b5fe4af --- /dev/null +++ b/paddle/fluid/framework/variable.h @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +class Variable { + public: + template + const T& Get() const { + PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); + return *static_cast(holder_->Ptr()); + } + + bool IsInitialized() const { return holder_ != nullptr; } + + template + T* GetMutable() { + if (!IsType()) { + holder_.reset(new PlaceholderImpl(new T())); + } + return static_cast(holder_->Ptr()); + } + + template + bool IsType() const { + return holder_ != nullptr && + std::type_index(typeid(T)) == std::type_index(holder_->Type()); + } + + void Clear() { holder_.reset(); } + + std::type_index Type() const { + PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); + return holder_->Type(); + } + + private: + struct Placeholder { + virtual ~Placeholder() {} + virtual const std::type_info& Type() const = 0; + virtual void* Ptr() const = 0; + }; + + // Placeholder hides type T, so it doesn't appear as a template + // parameter of Variable. 
+ template + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} + + virtual const std::type_info& Type() const { return type_; } + virtual void* Ptr() const { return static_cast(ptr_.get()); } + + std::unique_ptr ptr_; + const std::type_info& type_; + }; + + std::unique_ptr + holder_; // pointers to a PlaceholderImpl object indeed. + + // name_ is only meaningful with a Scope and accessible by it. + // + // NOTE: Please don't expose name_ by adding methods like + // Variable::Name or Scope::VarName! A variable could have a human + // readable name or an auto-generated scope-unique name. In the + // former case, the caller knows the name and doesn't need to access + // the name; in the latter case, the variable should be identified + // by its address but not the unreadable name. + friend class Scope; + const std::string* name_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/variable.md b/paddle/fluid/framework/variable.md similarity index 100% rename from paddle/framework/variable.md rename to paddle/fluid/framework/variable.md diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8c14e506fd7fd480012135b316479e45bed5584e --- /dev/null +++ b/paddle/fluid/framework/variable_test.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/variable.h" + +TEST(Variable, GetMutable) { + using paddle::framework::Variable; + + struct Tensor { + int content_; + }; + + std::unique_ptr v(new Variable()); + + Tensor* t = v->GetMutable(); + t->content_ = 1234; + + const Tensor& tt = v->Get(); + EXPECT_EQ(1234, tt.content_); + + std::string* s = v->GetMutable(); + *s = "hello"; + + const std::string& ss = v->Get(); + EXPECT_EQ("hello", ss); +} diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdb147955ca0700dc0854b54c38d961caf8845f3 --- /dev/null +++ b/paddle/fluid/inference/CMakeLists.txt @@ -0,0 +1,18 @@ +set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init) + +cc_library(paddle_fluid_api + SRCS io.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + +# Create static library +cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + +# Create shared library +cc_library(paddle_fluid_shared SHARED + SRCS io.cc + DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END) +set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) + +if(WITH_TESTING) + add_subdirectory(tests/book) +endif() diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..58d7ab40bfa67595a9c7c61ed431a7cf9509e1f7 --- /dev/null +++ b/paddle/fluid/inference/io.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/io.h" + +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace inference { + +void ReadBinaryFile(const std::string& filename, std::string& contents) { + VLOG(3) << "loading model from " << filename; + std::ifstream inputfs(filename, std::ios::in | std::ios::binary); + inputfs.seekg(0, std::ios::end); + contents.clear(); + contents.resize(inputfs.tellg()); + inputfs.seekg(0, std::ios::beg); + inputfs.read(&contents[0], contents.size()); + inputfs.close(); +} + +bool IsParameter(const framework::VarDesc* var, + const framework::ProgramDesc& main_program) { + if (var->Persistable()) { + // There are many unreachable variables in the program + for (size_t i = 0; i < main_program.Size(); ++i) { + const framework::BlockDesc& block = main_program.Block(i); + for (auto* op : block.AllOps()) { + if (op->Type() == framework::kFeedOpType) { + continue; + } + for (auto input_argument_name : op->InputArgumentNames()) { + if (input_argument_name == var->Name()) { + return true; + } + } + } + } + } + return false; +} + +void LoadPersistables(framework::Executor& executor, + framework::Scope& scope, + const framework::ProgramDesc& main_program, + const std::string& dirname, + const std::string& param_filename) { + const framework::BlockDesc& global_block = main_program.Block(0); + + framework::ProgramDesc* load_program = new framework::ProgramDesc(); + framework::BlockDesc* load_block = load_program->MutableBlock(0); + std::vector paramlist; + + for (auto* var 
: global_block.AllVars()) { + if (IsParameter(var, main_program)) { + VLOG(3) << "parameter's name: " << var->Name(); + + framework::VarDesc* new_var = load_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + if (!param_filename.empty()) { + paramlist.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); + op->CheckAttrs(); + } + } + } + + if (!param_filename.empty()) { + // sort paramlist to have consistent ordering + std::sort(paramlist.begin(), paramlist.end()); + // append just the load_combine op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", paramlist); + op->SetAttr("file_path", {param_filename}); + op->CheckAttrs(); + } + + executor.Run(*load_program, &scope, 0, true, true); + + VLOG(3) << "Ran loading successfully"; + delete load_program; +} + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& dirname) { + std::string model_filename = dirname + "/__model__"; + std::string program_desc_str; + ReadBinaryFile(model_filename, program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + + LoadPersistables(executor, scope, *main_program, dirname, ""); + return main_program; +} + +std::unique_ptr Load( + framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename) { + std::string model_filename = prog_filename; + std::string program_desc_str; + ReadBinaryFile(model_filename, program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + + 
LoadPersistables(executor, scope, *main_program, "", param_filename); + return main_program; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h new file mode 100644 index 0000000000000000000000000000000000000000..9d7864060646d9a480ce6ced6a1f4364e83938c0 --- /dev/null +++ b/paddle/fluid/inference/io.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace inference { + +void LoadPersistables(framework::Executor& executor, + framework::Scope& scope, + const framework::ProgramDesc& main_program, + const std::string& dirname, + const std::string& param_filename); + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& dirname); + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename); + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fe76afb582a13b741ab086f0c62d77e86d4e8bb --- /dev/null +++ 
b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -0,0 +1,34 @@ +function(inference_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs ARGS) + cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests) + set(arg_list "") + if(inference_test_ARGS) + foreach(arg ${inference_test_ARGS}) + list(APPEND arg_list "_${arg}") + endforeach() + else() + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_inference_${TARGET_NAME}${arg} + SRCS test_inference_${TARGET_NAME}.cc + DEPS ARCHIVE_START paddle_fluid ARCHIVE_END + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) + set_tests_properties(test_inference_${TARGET_NAME}${arg} + PROPERTIES DEPENDS test_${TARGET_NAME}) + endforeach() +endfunction(inference_test) + +inference_test(fit_a_line) +inference_test(image_classification ARGS vgg resnet) +inference_test(label_semantic_roles) +inference_test(recognize_digits ARGS mlp) +inference_test(recommender_system) +inference_test(rnn_encoder_decoder) +inference_test(understand_sentiment) +inference_test(word2vec) diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa18e69b3ac7e984172dd14a3cb8d48158dfb471 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, fit_a_line) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // The second dim of the input tensor should be 13 + // The input data should be >= 0 + int64_t batch_size = 10; + SetupTensor( + input, {batch_size, 13}, static_cast(0), static_cast(10)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc new file mode 100644 index 
0000000000000000000000000000000000000000..27f17712bca4103e2556cb375339ca785f53bd4f --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, image_classification) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [0.0, 1.0]. 
+ SetupTensor(input, + {batch_size, 3, 32, 32}, + static_cast(0), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc new file mode 100644 index 0000000000000000000000000000000000000000..55acd95f50906b13a5a906e0bcc2e73a0c7f8ef2 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, label_semantic_roles) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, + ctx_p2, mark; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor(word, lod, static_cast(0), static_cast(1)); + SetupLoDTensor( + predicate, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_n2, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_n1, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_0, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_p1, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_p2, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(mark, lod, static_cast(0), static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word); + cpu_feeds.push_back(&predicate); + cpu_feeds.push_back(&ctx_n2); + cpu_feeds.push_back(&ctx_n1); + cpu_feeds.push_back(&ctx_0); + cpu_feeds.push_back(&ctx_p1); + cpu_feeds.push_back(&ctx_p2); + cpu_feeds.push_back(&mark); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << 
output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc new file mode 100644 index 0000000000000000000000000000000000000000..99cf0f3095bf7f93d53272e0ae13242484d7128c --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recognize_digits) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. 
+ SetupTensor(input, + {batch_size, 1, 28, 28}, + static_cast(-1), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} + +TEST(inference, recognize_digits_combine) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. 
+ SetupTensor( + input, {1, 28, 28}, static_cast(-1), static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference( + dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference( + dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc new file mode 100644 index 0000000000000000000000000000000000000000..9208c2a59965ad5296238a23e89cd290b5e19740 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recommender_system) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id, + category_id, movie_title; + + // Use the first data from paddle.dataset.movielens.test() as input + std::vector user_id_data = {1}; + SetupTensor(user_id, {batch_size, 1}, user_id_data); + + std::vector gender_id_data = {1}; + SetupTensor(gender_id, {batch_size, 1}, gender_id_data); + + std::vector age_id_data = {0}; + SetupTensor(age_id, {batch_size, 1}, age_id_data); + + std::vector job_id_data = {10}; + SetupTensor(job_id, {batch_size, 1}, job_id_data); + + std::vector movie_id_data = {783}; + SetupTensor(movie_id, {batch_size, 1}, movie_id_data); + + std::vector category_id_data = {10, 8, 9}; + SetupLoDTensor(category_id, {3, 1}, {{0, 3}}, category_id_data); + + std::vector movie_title_data = {1069, 4140, 2923, 710, 988}; + SetupLoDTensor(movie_title, {5, 1}, {{0, 5}}, movie_title_data); + + std::vector cpu_feeds; + cpu_feeds.push_back(&user_id); + cpu_feeds.push_back(&gender_id); + cpu_feeds.push_back(&age_id); + cpu_feeds.push_back(&job_id); + cpu_feeds.push_back(&movie_id); + cpu_feeds.push_back(&category_id); + cpu_feeds.push_back(&movie_title); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + 
paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc new file mode 100644 index 0000000000000000000000000000000000000000..c88ca30cb781c1980d960c5e4e1137dcfd54afac --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, rnn_encoder_decoder) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word_data, trg_word; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor( + word_data, lod, static_cast(0), static_cast(1)); + SetupLoDTensor( + trg_word, lod, static_cast(0), static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word_data); + cpu_feeds.push_back(&trg_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b29d52880cef1710696074ed8b2fdecf4f9fcca --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, understand_sentiment) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, 4, 10}}; + SetupLoDTensor(words, lod, static_cast(0), static_cast(10)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc new file mode 100644 index 0000000000000000000000000000000000000000..93376b6824daf000dd9996c17ca9737b5b600e10 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, word2vec) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word; + paddle::framework::LoD lod{{0, 1}}; + int64_t dict_size = 2072; // Hard-coding the size of dictionary + + SetupLoDTensor(first_word, lod, static_cast(0), dict_size); + SetupLoDTensor(second_word, lod, static_cast(0), dict_size); + SetupLoDTensor(third_word, lod, static_cast(0), dict_size); + SetupLoDTensor(fourth_word, lod, static_cast(0), dict_size); + + std::vector cpu_feeds; + cpu_feeds.push_back(&first_word); + cpu_feeds.push_back(&second_word); + cpu_feeds.push_back(&third_word); + cpu_feeds.push_back(&fourth_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, 
cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..a6c93aa0737f79ca1d626862256d3c79a36868ae --- /dev/null +++ b/paddle/fluid/inference/tests/test_helper.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/io.h" + +template +void SetupTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + T lower, + T upper) { + srand(time(0)); + T* input_ptr = input.mutable_data(dims, paddle::platform::CPUPlace()); + for (int i = 0; i < input.numel(); ++i) { + input_ptr[i] = + (static_cast(rand()) / static_cast(RAND_MAX)) * (upper - lower) + + lower; + } +} + +template +void SetupTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + std::vector& data) { + CHECK_EQ(paddle::framework::product(dims), static_cast(data.size())); + T* input_ptr = input.mutable_data(dims, paddle::platform::CPUPlace()); + memcpy(input_ptr, data.data(), input.numel() * sizeof(T)); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor& input, + paddle::framework::LoD& lod, + T lower, + T upper) { + input.set_lod(lod); + int dim = lod[0][lod[0].size() - 1]; + SetupTensor(input, {dim, 1}, lower, upper); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + paddle::framework::LoD lod, + std::vector& data) { + const size_t level = lod.size() - 1; + CHECK_EQ(dims[0], static_cast((lod[level]).back())); + input.set_lod(lod); + SetupTensor(input, dims, data); +} + +template +void CheckError(paddle::framework::LoDTensor& output1, + paddle::framework::LoDTensor& output2) { + // Check lod information + EXPECT_EQ(output1.lod(), output2.lod()); + + EXPECT_EQ(output1.dims(), output2.dims()); + EXPECT_EQ(output1.numel(), output2.numel()); + + T err = static_cast(0); + if (typeid(T) == typeid(float)) { + err = 1E-3; + } else if (typeid(T) == typeid(double)) { + err = 1E-6; + } else { + err = 0; + } + + size_t count = 0; + for (int64_t i = 0; i < output1.numel(); ++i) { + if (fabs(output1.data()[i] - output2.data()[i]) > err) { + count++; + } + } + EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; 
+} + +template +void TestInference(const std::string& dirname, + const std::vector& cpu_feeds, + std::vector& cpu_fetchs) { + // 1. Define place, executor, scope + auto place = Place(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + if (IsCombined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // Users are free to specify different filename + // (provided: the filenames are changed in the python api as well: io.py) + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } + + // 3. Get the feed_target_names and fetch_target_names + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + + // 4. Prepare inputs: set up maps for feed targets + std::map feed_targets; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + // Please make sure that cpu_feeds[i] is right for feed_target_names[i] + feed_targets[feed_target_names[i]] = cpu_feeds[i]; + } + + // 5. Define Tensor to get the outputs: set up maps for fetch targets + std::map fetch_targets; + for (size_t i = 0; i < fetch_target_names.size(); ++i) { + fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; + } + + // 6. 
Run the inference program + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + + delete scope; +} diff --git a/paddle/platform/.clang-format b/paddle/fluid/memory/.clang-format similarity index 100% rename from paddle/platform/.clang-format rename to paddle/fluid/memory/.clang-format diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a61c484823b292234d4758cdc1959d7a21510e6 --- /dev/null +++ b/paddle/fluid/memory/CMakeLists.txt @@ -0,0 +1,16 @@ +add_subdirectory(detail) + +cc_library(memory SRCS memory.cc DEPS place enforce) +cc_library(memcpy SRCS memcpy.cc DEPS place) + +cc_library(paddle_memory + DEPS + memory + memcpy + meta_data + meta_cache + memory_block + buddy_allocator + system_allocator) + +cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) diff --git a/paddle/memory/README.md b/paddle/fluid/memory/README.md similarity index 100% rename from paddle/memory/README.md rename to paddle/fluid/memory/README.md diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt similarity index 100% rename from paddle/memory/detail/CMakeLists.txt rename to paddle/fluid/memory/detail/CMakeLists.txt diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cee8271d27014815b19175ef93759d6a07b7e73 --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -0,0 +1,329 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "glog/logging.h" + +namespace paddle { +namespace memory { +namespace detail { + +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) {} + +BuddyAllocator::~BuddyAllocator() { + VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; + while (!pool_.empty()) { + auto block = static_cast(std::get<2>(*pool_.begin())); + VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + pool_.erase(pool_.begin()); + } +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + VLOG(10) << "Allocate from system allocator."; + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + } else { + VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void BuddyAllocator::Free(void* p) { + // Point back to metadata + auto block = static_cast(p)->metadata(); + + // Acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(10) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + VLOG(10) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + cache_.invalidate(block); + + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + VLOG(10) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if 
(right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + right_buddy->total_size(cache_), + right_buddy)); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + VLOG(10) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); + + // merge the block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } + } + + // Dumping this block into pool + VLOG(10) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); + + // Clean up if existing too much free memory + + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); + + // Free normal allocation + CleanIdleNormalAlloc(); +} + +size_t BuddyAllocator::Used() { return total_used_; } + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + VLOG(10) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifdef PADDLE_WITH_CUDA + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. 
+ max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + VLOG(10) << "Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); + + // no match chunk memory + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + // find suitable one + if (std::get<1>(*it) >= size) { + return it; + } + // update and continue + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + pool_.erase(it); + + VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + block->split(cache_, size); + + VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; + + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + 
block->right_buddy(cache_))); + } + } + + return block; +} + +void BuddyAllocator::CleanIdleFallBackAlloc() { + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + VLOG(10) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + fallback_alloc_count_--; + + // If no fall allocation exists, return directly + if (!fallback_alloc_count_) return; + } +} + +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true; + } + // keep 2x overhead if we haven't fallen back + if ((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + VLOG(10) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + + if (!shall_free_alloc()) return; + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git 
a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..644d79330680787f717920652708c0dd5bee1833 --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "paddle/fluid/memory/detail/meta_data.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/gpu_info.h" + +#include +#include +#include +#include + +namespace paddle { +namespace memory { +namespace detail { + +class BuddyAllocator { + public: + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + + ~BuddyAllocator(); + + public: + void* Alloc(size_t unaligned_size); + void Free(void* ptr); + size_t Used(); + + public: + // Disable copy and assignment + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + + private: + // Tuple (allocator index, memory size, memory address) + using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation + using PoolSet = std::set; + + /*! \brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); + + /*! 
\brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); + + /** + * \brief Find the suitable chunk from existing pool and split + * it to left and right buddies + * + * \param it the iterator of pool list + * \param size the size of allocation + * + * \return the left buddy address + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); + + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); + + /*! \brief Clean idle fallback allocation */ + void CleanIdleFallBackAlloc(); + + /*! \brief Clean idle normal allocation */ + void CleanIdleNormalAlloc(); + + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ + PoolSet pool_; + + /*! Record fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + + private: + /*! Unify the metadata format between GPU and CPU allocations */ + MetadataCache cache_; + + private: + /*! Allocate CPU/GPU memory from system */ + SystemAllocator* system_allocator_; + std::mutex mutex_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc new file mode 100644 index 0000000000000000000000000000000000000000..23388cdd5b7c44ff91e10aadaa8cc25d8ef29d14 --- /dev/null +++ b/paddle/fluid/memory/detail/memory_block.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "paddle/fluid/memory/detail/meta_data.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::total_size(MetadataCache& cache) const { + return cache.load(this).total_size; +} + +MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache& cache, size_t size) { + // make sure the split fits + PADDLE_ASSERT(total_size(cache) >= size); + + // bail out if there is no room for another partition + if (total_size(cache) - size <= sizeof(Metadata)) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(cache) - size; + + // Add the new block as a buddy + auto metadata = cache.load(this); + + // Write the metadata for the new block + auto 
new_block_right_buddy = metadata.right_buddy; + + cache.store( + static_cast(right_partition), + Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), + remaining_size, this, new_block_right_buddy)); + + metadata.right_buddy = static_cast(right_partition); + metadata.size = size - sizeof(Metadata); + metadata.total_size = size; + + cache.store(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache.load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache.store(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); + + auto metadata = cache.load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache.load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache.store(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(cache); + metadata.total_size += right_buddy->total_size(cache); + + cache.store(this, metadata); + cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache& cache) { + // check for double free or corruption + PADDLE_ASSERT(type(cache) != FREE_CHUNK); + PADDLE_ASSERT(type(cache) != INVALID_CHUNK); + + set_type(cache, FREE_CHUNK); +} + +void MemoryBlock::set_type(MetadataCache& cache, Type t) { + auto metadata = cache.load(this); + + metadata.type = t; + + cache.store(this, metadata); +} + +bool MemoryBlock::has_left_buddy(MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool MemoryBlock::has_right_buddy(MetadataCache& 
cache) const { + return right_buddy(cache) != nullptr; +} + +size_t MemoryBlock::index(MetadataCache& cache) const { + return cache.load(this).index; +} + +void* MemoryBlock::data() const { + return const_cast(reinterpret_cast(this)) + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h similarity index 100% rename from paddle/memory/detail/memory_block.h rename to paddle/fluid/memory/detail/memory_block.h diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc new file mode 100644 index 0000000000000000000000000000000000000000..7d78811c7715b906aea1b88c13a4c3939db6387d --- /dev/null +++ b/paddle/fluid/memory/detail/meta_cache.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "glog/logging.h" +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + PADDLE_ASSERT(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + auto* meta = reinterpret_cast(block); + VLOG(10) << "Load MetaData type=" << meta->type; + PADDLE_ASSERT(meta->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_cache.h b/paddle/fluid/memory/detail/meta_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..635d6398e697de80d0606a200c2634a93468199d --- /dev/null +++ b/paddle/fluid/memory/detail/meta_cache.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. + */ +class MetadataCache { + public: + explicit MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. */ + Metadata load(const MemoryBlock* memory_block); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock* memory_block, const Metadata& meta_data); + + /*! \brief Indicate that the specified metadata will no longer be used. */ + void invalidate(MemoryBlock* memory_block); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_data.cc b/paddle/fluid/memory/detail/meta_data.cc new file mode 100644 index 0000000000000000000000000000000000000000..eae49ebdcffd03eeb192fb7e859666027336245b --- /dev/null +++ b/paddle/fluid/memory/detail/meta_data.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline size_t hash(const Metadata* metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(seed, (size_t)metadata->type); + hash_combine(seed, metadata->index); + hash_combine(seed, metadata->size); + hash_combine(seed, metadata->total_size); + hash_combine(seed, metadata->left_buddy); + hash_combine(seed, metadata->right_buddy); + + return seed; +} + +void Metadata::update_guards() { + guard_begin = hash(this, 1); + guard_end = hash(this, 2); +} + +bool Metadata::check_guards() const { + return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_data.h b/paddle/fluid/memory/detail/meta_data.h new file mode 100644 index 0000000000000000000000000000000000000000..368523701ef1a7b3bd869e1f0542c42c61448b40 --- /dev/null +++ b/paddle/fluid/memory/detail/meta_data.h @@ -0,0 +1,54 @@ +/* Copyright (c) 
2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/memory/detail/memory_block.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +class Metadata { + public: + Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + Metadata(); + + public: + /*! \brief Update the guards when metadata is changed */ + void update_guards(); + + /*! \brief Check consistency to previous modification */ + bool check_guards() const; + + public: + // TODO(gangliao): compress this + // clang-format off + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + // clang-format on +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f07c5e789c42997b3c75167a26a5b09875bd498 --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" + +#include // for malloc and free +#include // for mlock and munlock +#include // for std::max + +#include "gflags/gflags.h" + +// If use_pinned_memory is true, CPUAllocator calls mlock, which +// returns pinned and locked memory as staging areas for data exchange +// between host and device. Allocates too much would reduce the amount +// of memory available to the system for paging. So, by default, we +// should set false to use_pinned_memory. +DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +DECLARE_double(fraction_of_gpu_memory_to_use); +namespace paddle { +namespace memory { +namespace detail { + +void* CPUAllocator::Alloc(size_t& index, size_t size) { + // According to http://www.cplusplus.com/reference/cstdlib/malloc/, + // malloc might not return nullptr if size is zero, but the returned + // pointer shall not be dereferenced -- so we make it nullptr. 
+ if (size <= 0) return nullptr; + + index = 0; // unlock memory + + void* p; + +#ifdef PADDLE_WITH_MKLDNN + // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp + // memory alignment + PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); +#else + PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0); +#endif + PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + index = 1; + mlock(p, size); // lock memory + } + } + + return p; +} + +void CPUAllocator::Free(void* p, size_t size, size_t index) { + if (p != nullptr && index == 1) { + munlock(p, size); + } + free(p); +} + +bool CPUAllocator::UseGpu() const { return false; } + +#ifdef PADDLE_WITH_CUDA + +void* GPUAllocator::Alloc(size_t& index, size_t size) { + // CUDA documentation doesn't explain if cudaMalloc returns nullptr + // if size is 0. We just make sure it does. + if (size <= 0) return nullptr; + void* p; + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + gpu_alloc_size_ += size; + return p; + } else { + LOG(WARNING) + << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " + "environment variable to a lower value. Current value is " + << FLAGS_fraction_of_gpu_memory_to_use; + return nullptr; + } +} + +void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. 
+ if (err != cudaErrorCudartUnloading) { + PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free."); + } +} + +bool GPUAllocator::UseGpu() const { return true; } + +#endif + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h similarity index 100% rename from paddle/memory/detail/system_allocator.h rename to paddle/fluid/memory/detail/system_allocator.h diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a850e480ec948b727980a8020df91958584aea02 --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/detail/system_allocator.h" + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_bool(use_pinned_memory); + +void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { + bool freed = false; + { + size_t index; + void* p = a.Alloc(index, size); + if (size > 0) { + EXPECT_NE(p, nullptr); + } else { + EXPECT_EQ(p, nullptr); + } + + int* i = static_cast(p); + std::shared_ptr ptr(i, [&](void* p) { + freed = true; + a.Free(p, size, index); + }); + } + EXPECT_TRUE(freed); +} + +TEST(CPUAllocator, NoLockMem) { + FLAGS_use_pinned_memory = false; + paddle::memory::detail::CPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} + +TEST(CPUAllocator, LockMem) { + FLAGS_use_pinned_memory = true; + paddle::memory::detail::CPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(GPUAllocator, Alloc) { + paddle::memory::detail::GPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} +#endif diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc new file mode 100644 index 0000000000000000000000000000000000000000..8938b3613373a06620a6a0237b3de773c6421edd --- /dev/null +++ b/paddle/fluid/memory/memcpy.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/memcpy.h" + +#include // for memcpy + +namespace paddle { +namespace memory { + +template <> +void Copy(platform::CPUPlace, void* dst, + platform::CPUPlace, + const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +#ifdef PADDLE_WITH_CUDA +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + platform::SetDeviceId(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + if (dst_place == src_place) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, + stream); + } +} + +#endif + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..77d209c3fbe8256bc94b3eca866f0f7e17a93325 --- /dev/null +++ b/paddle/fluid/memory/memcpy.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); + +#ifdef PADDLE_WITH_CUDA + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + cudaStream_t stream); + +#endif +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc new file mode 100644 index 0000000000000000000000000000000000000000..6eedab5d034192c071328b1be5c296227383287e --- /dev/null +++ b/paddle/fluid/memory/memory.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memory.h" + +#include "glog/logging.h" + +#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); + +namespace paddle { +namespace memory { + +using BuddyAllocator = detail::BuddyAllocator; + +BuddyAllocator* GetCPUBuddyAllocator() { + static detail::BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new detail::BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); + } + return a; +} + +template <> +void* Alloc(platform::CPUPlace place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void* p = GetCPUBuddyAllocator()->Alloc(size); + VLOG(10) << " pointer=" << p; + return p; +} + +template <> +void Free(platform::CPUPlace place, void* p) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); +} + +#ifdef PADDLE_WITH_CUDA + +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetCUDADeviceCount(); + as = new BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + as[gpu] = nullptr; + } + } + 
platform::SetDeviceId(gpu_id); + if (!as[gpu_id]) { + as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; + } + return as[gpu_id]; +} + +template <> +size_t Used(platform::CUDAPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); +} + +template <> +void* Alloc(platform::CUDAPlace place, size_t size) { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + auto* ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + int cur_dev = platform::GetCurrentDeviceId(); + platform::SetDeviceId(place.device); + size_t avail, total; + platform::GpuMemoryUsage(avail, total); + LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " + << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "total " << total; + LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); + LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); + LOG(WARNING) << "GPU memory used: " << Used(place); + platform::SetDeviceId(cur_dev); + } + return ptr; +} + +template <> +void Free(platform::CUDAPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); +} + +#endif + +size_t Usage::operator()(const platform::CPUPlace& cpu) const { + return Used(cpu); +} + +size_t Usage::operator()(const platform::CUDAPlace& gpu) const { +#ifdef PADDLE_WITH_CUDA + return Used(gpu); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +size_t memory_usage(const platform::Place& p) { + return boost::apply_visitor(Usage(), p); +} + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h new file mode 100644 index 
0000000000000000000000000000000000000000..a9166a6746e1985752ca18ffa7c429e5b35b55bb --- /dev/null +++ b/paddle/fluid/memory/memory.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Allocate memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] size Allocation size. + * + * \return Allocated memory block address. + * + * \note If return nullptr, it indicates memory allocation failed + * because insufficient memory in current system. When Alloc + * function is invoked, you must check the returned memory + * address is valid or not. + */ +template +void* Alloc(Place place, size_t size); + +/** + * \brief Free memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] ptr Memory block address to free. + * + */ +template +void Free(Place place, void* ptr); + +/** + * \brief Total size of used memory in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * + */ +template +size_t Used(Place place); + +struct Usage : public boost::static_visitor { + size_t operator()(const platform::CPUPlace& cpu) const; + size_t operator()(const platform::CUDAPlace& gpu) const; +}; + +size_t memory_usage(const platform::Place& p); + +/** + * \brief Free memory block in one place. 
+ * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PODDeleter { + static_assert(std::is_pod::value, "T must be POD"); + + public: + explicit PODDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, static_cast(ptr)); } + + private: + Place place_; +}; + +/** + * \brief Free memory block in one place does not meet POD + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PlainDeleter { + public: + explicit PlainDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, reinterpret_cast(ptr)); } + + private: + Place place_; +}; + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7505ef0f36bc8765ba7634f286b67bccc6eacb6 --- /dev/null +++ b/paddle/fluid/memory/memory_test.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_data.h" + +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +#include +#include + +inline bool is_aligned(void const *p) { + return 0 == (reinterpret_cast(p) & 0x3); +} + +size_t align(size_t size, paddle::platform::CPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::CpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::platform::Place place = cpu; + EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place)); + + paddle::memory::Free(cpu, p); +} + +TEST(BuddyAllocator, CPUMultAlloc) { + paddle::platform::CPUPlace cpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } +} + +#ifdef PADDLE_WITH_CUDA + +size_t align(size_t 
size, paddle::platform::CUDAPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CUDAPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::platform::Place place = gpu; + EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place)); + + paddle::memory::Free(gpu, p); +} + +TEST(BuddyAllocator, GPUMultAlloc) { + paddle::platform::CUDAPlace gpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } +} + +#endif diff --git a/paddle/fluid/operators/.clang-format b/paddle/fluid/operators/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/operators/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cadfd735d7b3feb473c308b04417f0a1e0f22249 --- /dev/null +++ b/paddle/fluid/operators/CMakeLists.txt @@ -0,0 +1,203 @@ +file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +set(DEPS_OPS "") +set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) +file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") +function(op_library TARGET) + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. + set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) + set(cc_srcs) + set(cu_srcs) + set(cu_cc_srcs) + set(op_common_deps operator op_registry math_function) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(pybind_flag 0) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + list(LENGTH op_library_SRCS op_library_SRCS_len) + if (${op_library_SRCS_len} EQUAL 0) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_cc_srcs ${TARGET}.cu.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + else() + foreach(src ${op_library_SRCS}) + if (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cu.cc$") + list(APPEND cu_cc_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + endif() + + list(LENGTH cc_srcs cc_srcs_len) + if (${cc_srcs_len} EQUAL 0) + message(FATAL_ERROR 
"The op library ${TARGET} should contains at least one .cc file") + endif() + + list(LENGTH op_library_DEPS op_library_DEPS_len) + if (${op_library_DEPS_len} GREATER 0) + set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) + endif() + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + else() + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + endif() + + # Define operators that don't need pybind here. + foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() + + # The registration of USE_OP, please refer to paddle/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. + file(READ ${TARGET}.cc TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") + if (one_register STREQUAL "") + string(REPLACE "_op" "" TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}") + string(REPLACE "," "" TARGET "${TARGET}") + endif() + + # pybind USE_NO_KERNEL_OP + # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel + string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") + string(REPLACE "_op" "" TARGET "${TARGET}") + if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") + file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") + set(pybind_flag 1) + endif() + + # pybind USE_CPU_ONLY_OP + list(LENGTH cu_srcs cu_srcs_len) + list(LENGTH cu_cc_srcs cu_cc_srcs_len) + if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) + file(APPEND ${pybind_file} 
"USE_CPU_ONLY_OP(${TARGET});\n") + set(pybind_flag 1) + endif() + + # pybind USE_OP + if (${pybind_flag} EQUAL 0) + file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") + endif() +endfunction() + +add_subdirectory(math) +add_subdirectory(nccl) + +if(WITH_GPU) + op_library(nccl_op DEPS nccl_common) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") +else() + set(DEPS_OPS ${DEPS_OPS} nccl_op) +endif() + +if(WITH_DISTRIBUTE) + add_subdirectory(detail) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) +else() + set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) +endif() + +op_library(cond_op DEPS framework_proto tensor net_op) +op_library(cross_entropy_op DEPS cross_entropy) +op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +op_library(softmax_op DEPS softmax) +op_library(detection_output_op DEPS softmax) +op_library(sequence_softmax_op DEPS softmax) +op_library(sum_op DEPS selected_rows_functor) +op_library(sgd_op DEPS selected_rows_functor) +op_library(print_op DEPS lod_tensor) +op_library(adagrad_op DEPS selected_rows_functor) +op_library(maxout_op DEPS maxouting) +op_library(unpool_op DEPS unpooling) +op_library(pool_with_index_op DEPS pooling) +op_library(lod_rank_table_op DEPS lod_rank_table) 
+op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) +op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) +op_library(max_sequence_len_op DEPS lod_rank_table) +op_library(sequence_conv_op DEPS context_project) +op_library(sequence_pool_op DEPS sequence_pooling) +op_library(lstm_op DEPS sequence2batch lstm_compute) +op_library(lstmp_op DEPS sequence2batch lstm_compute) +op_library(gru_op DEPS sequence2batch gru_compute) +op_library(recurrent_op DEPS executor) +op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function) +op_library(cos_sim_op DEPS cos_sim_functor) +op_library(parallel_do_op DEPS executor) +op_library(create_reader_op DEPS reader) + +# Regist multiple Kernel to pybind +if (WITH_GPU) + +op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS + vol2col depthwise_conv) + +op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) +op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc + conv_transpose_cudnn_op.cu.cc DEPS vol2col) +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n") +else() +op_library(conv_op SRCS conv_op.cc DEPS vol2col) +op_library(pool_op SRCS pool_op.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col) +endif() + +# FIXME(typhoonzero): save/load depends lodtensor serialization functions +op_library(save_op DEPS lod_tensor) +op_library(load_op DEPS lod_tensor) +op_library(save_combine_op DEPS lod_tensor) +op_library(load_combine_op DEPS lod_tensor) + +list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) +foreach(src ${GENERAL_OPS}) + op_library(${src}) +endforeach() +file(APPEND ${pybind_file} 
"USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n") + +set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") + +cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) +cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) +cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) +cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) +cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) +if(WITH_GPU) + cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +endif() +cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) +cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/accuracy_op.cc b/paddle/fluid/operators/accuracy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..43689b3b7da5a0f5157ec8bc5fcf19d643ddc4ca --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/accuracy_op.h" + +namespace paddle { +namespace operators { + +class AccuracyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Input (Out) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input (Indices) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input (Label) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Accuracy"), + "Output (Accuracy) of AccuracyOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Correct"), + "Output (Correct) of AccuracyOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Total"), + "Output (Total) of AccuracyOp should not be null."); + + auto inference_dim = ctx->GetInputDim("Out"); + auto label_dim = ctx->GetInputDim("Label"); + // Assume indices has same shape as inference, because + // it's the output of topk. + + PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2."); + PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1"); + PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0], + "the inference tensor's num_rows must be" + " the same as label."); + + ctx->SetOutputDim("Accuracy", {1}); + ctx->SetOutputDim("Correct", {1}); + ctx->SetOutputDim("Total", {1}); + ctx->ShareLoD("Out", /*->*/ "Accuracy"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Out")->type()), + ctx.GetPlace()); + } +}; + +class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + // TODO(typhoonzero): support both inference value and indices. 
+ AddInput("Out", "The network output of topk (inferences)"); + AddInput("Indices", "The the network output of topk (indices)"); + AddInput("Label", "Label of the training data"); + // TODO(typhoonzero): AddInput("Weight", ... + AddOutput("Accuracy", "The accuracy of current batch"); + AddOutput("Correct", "The correct samples count of current batch"); + AddOutput("Total", "The samples count of current batch"); + + AddComment(R"DOC( +Accuracy Operator. + +It will print accuracy rate for classification. +The accuracy is calculated as follows: + +$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$ + +Both the input Out and Label can carry the LoD (Level of Details) +information, or not. But the output only shares the LoD information +with the input Out(Inference). + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, + paddle::framework::EmptyGradOpMaker); +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. +REGISTER_OP_CPU_KERNEL(accuracy, + ops::AccuracyKernel, + ops::AccuracyKernel); diff --git a/paddle/fluid/operators/accuracy_op.cu b/paddle/fluid/operators/accuracy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4462b9ba5c0e902933c53130f72fe40f807bde4a --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.cu @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/operators/accuracy_op.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void AccuracyCudaKernel(const int N, const int D, + const int64_t* Xdata, + const int64_t* labeldata, int* correct_data, + float* accuracy, int* total_data) { + int count = 0; + __shared__ int total[BlockSize]; + + // support only 1 block + for (int i = threadIdx.x; i < (N); i += BlockSize) { + for (int j = 0; j < D; ++j) { + if (Xdata[i * D + j] == labeldata[i]) { + ++count; + break; + } + } + } + total[threadIdx.x] = count; + __syncthreads(); + + // reduce the count with init value 0, and output accuracy. + int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); + if (threadIdx.x == 0) { + *correct_data = result; + *accuracy = static_cast(result) / static_cast(N); + *total_data = N; + } +} + +template +class AccuracyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + // FIXME(typhoonzero): only support indices currently + // if add support for output values, how to detect the data type? 
+ const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + + int num_samples = static_cast(inference->dims()[0]); + size_t infer_width = inference->dims()[1]; + auto stream = ctx.cuda_device_context().stream(); + platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); + + if (num_samples == 0) { + return; + } + + AccuracyCudaKernel< + PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_samples, infer_width, indices_data, label_data, correct_data, + accuracy_data, total_data); + } +}; + +} // namespace operators +} // namespace paddle + +// FIXME(typhoonzero): types of T is for inference data. +// label data is always int64 +REGISTER_OP_CUDA_KERNEL(accuracy, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/accuracy_op.h b/paddle/fluid/operators/accuracy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b3ed1d3fe09ba044142ed69a463918d7e03a78e9 --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AccuracyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + + size_t num_samples = inference->dims()[0]; + size_t class_dim = inference->dims()[1]; + *accuracy_data = 0.0f; + + if (num_samples == 0) { + return; + } + + int num_correct = 0; + // assume inference is already the topk of the output + for (size_t i = 0; i < num_samples; ++i) { + PADDLE_ENFORCE_GE(label_data[i], 0, "label must >= 0"); + for (size_t j = 0; j < class_dim; ++j) { + if (indices_data[i * class_dim + j] == label_data[i]) { + ++num_correct; + break; + } + } + } + + *correct_data = num_correct; + *total_data = num_samples; + *accuracy_data = + static_cast(num_correct) / static_cast(num_samples); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c04dd8cb9163cf7b05fd09bcf7f1d2937368614f --- /dev/null +++ b/paddle/fluid/operators/activation_op.cc @@ -0,0 +1,615 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/activation_op.h" + +namespace paddle { +namespace operators { + +class ActivationOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ActivationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } +}; + +class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sigmoid operator"); + AddOutput("Out", "Output of Sigmoid operator"); + AddComment(R"DOC( +Sigmoid Activation Operator + +$$out = \frac{1}{1 + e^{-x}}$$ + +)DOC"); + } +}; + +class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of LogSigmoid operator"); + AddOutput("Out", "Output of LogSigmoid operator"); + AddComment(R"DOC( +Logsigmoid Activation Operator + +$$out = \log \frac{1}{1 + e^{-x}}$$ + +)DOC"); + } +}; + +class ExpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ExpOpMaker(OpProto 
*proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Exp operator"); + AddOutput("Out", "Output of Exp operator"); + AddComment(R"DOC( +Exp Activation Operator. + +$out = e^x$ + +)DOC"); + } +}; + +class ReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Relu operator"); + AddOutput("Out", "Output of Relu operator"); + AddComment(R"DOC( +Relu Activation Operator. + +$out = \max(x, 0)$ + +)DOC"); + } +}; + +class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of LeakyRelu operator"); + AddOutput("Out", "Output of LeakyRelu operator"); + AddAttr("alpha", "The small negative slope").SetDefault(0.02f); + AddComment(R"DOC( +LeakyRelu Activation Operator. + +$out = \max(x, \alpha * x)$ + +)DOC"); + } +}; + +class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softshrink operator"); + AddOutput("Out", "Output of Softshrink operator"); + AddAttr("lambda", "non-negative offset").SetDefault(0.5f); + AddComment(R"DOC( +Softshrink Activation Operator. 
+ +$$ +out = \begin{cases} + x - \lambda, \text{if } x > \lambda \\ + x + \lambda, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class TanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Tanh operator"); + AddOutput("Out", "Output of Tanh operator"); + AddComment(R"DOC( +Tanh Activation Operator. + +$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ + +)DOC"); + } +}; + +class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of TanhShrink operator"); + AddOutput("Out", "Output of TanhShrink operator"); + AddComment(R"DOC( +TanhShrink Activation Operator. + +$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ + +)DOC"); + } +}; + +class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of HardShrink operator"); + AddOutput("Out", "Output of HardShrink operator"); + AddAttr("threshold", "The value of threshold for HardShrink") + .SetDefault(0.5f); + AddComment(R"DOC( +HardShrink Activation Operator. + +$$ +out = \begin{cases} + x, \text{if } x > \lambda \\ + x, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sqrt operator"); + AddOutput("Out", "Output of Sqrt operator"); + AddComment(R"DOC( +Sqrt Activation Operator. 
+ +$out = \sqrt{x}$ + +)DOC"); + } +}; + +class AbsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Abs operator"); + AddOutput("Out", "Output of Abs operator"); + AddComment(R"DOC( +Abs Activation Operator. + +$out = |x|$ + +)DOC"); + } +}; + +class CeilOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Ceil operator"); + AddOutput("Out", "Output of Ceil operator"); + AddComment(R"DOC( +Ceil Activation Operator. + +$out = ceil(x)$ + +)DOC"); + } +}; + +class FloorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Floor operator"); + AddOutput("Out", "Output of Floor operator"); + AddComment(R"DOC( +Floor Activation Operator. + +$out = floor(x)$ + +)DOC"); + } +}; + +class RoundOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Round operator"); + AddOutput("Out", "Output of Round operator"); + AddComment(R"DOC( +Round Activation Operator. + +$out = [x]$ + +)DOC"); + } +}; + +class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Reciprocal operator"); + AddOutput("Out", "Output of Reciprocal operator"); + AddComment(R"DOC( +Reciprocal Activation Operator. 
+ +$$out = \frac{1}{x}$$ + +)DOC"); + } +}; + +class LogOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Log operator"); + AddOutput("Out", "Output of Log operator"); + AddComment(R"DOC( +Log Activation Operator. + +$out = \ln(x)$ + +Natural logarithm of x. + +)DOC"); + } +}; + +class SquareOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Square operator"); + AddOutput("Out", "Output of Square operator"); + AddComment(R"DOC( +Square Activation Operator. + +$out = x^2$ + +)DOC"); + } +}; + +class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softplus operator"); + AddOutput("Out", "Output of Softplus operator"); + AddComment(R"DOC( +Softplus Activation Operator. + +$out = \ln(1 + e^{x})$ + +)DOC"); + } +}; + +class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softsign operator"); + AddOutput("Out", "Output of Softsign operator"); + AddComment(R"DOC( +Softsign Activation Operator. 
+ +$$out = \frac{x}{1 + |x|}$$ + +)DOC"); + } +}; + +class BReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of BRelu operator"); + AddOutput("Out", "Output of BRelu operator"); + AddAttr("t_min", "The min marginal value of BRelu") + .SetDefault(static_cast(0)); + AddAttr("t_max", "The max marginal value of BRelu") + .SetDefault(static_cast(24)); + AddComment(R"DOC( +BRelu Activation Operator. + +$out = \max(\min(x, t_{min}), t_{max})$ + +)DOC"); + } +}; + +class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of SoftRelu operator"); + AddOutput("Out", "Output of SoftRelu operator"); + AddAttr("threshold", "The threshold value of SoftRelu") + .SetDefault(40.0f); + AddComment(R"DOC( +SoftRelu Activation Operator. + +$out = \ln(1 + \exp(\max(\min(x, threshold), threshold))$ + +)DOC"); + } +}; + +class ELUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of ELU operator"); + AddOutput("Out", "Output of ELU operator"); + AddAttr("alpha", "The alpha value of ELU").SetDefault(1.0f); + AddComment(R"DOC( +ELU Activation Operator. + +Applies the following element-wise computation on the input according to +https://arxiv.org/abs/1511.07289. 
+ +$out = \max(0, x) + \min(0, \alpha * (e^x - 1))$ + +)DOC"); + } +}; + +class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { + public: + Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Relu6 operator"); + AddOutput("Out", "Output of Relu6 operator"); + AddAttr("threshold", "The threshold value of Relu6") + .SetDefault(6.0f); + AddComment(R"DOC( +Relu6 Activation Operator. + +$out = \min(\max(0, x), 6)$ + +)DOC"); + } +}; + +class PowOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PowOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Pow operator"); + AddOutput("Out", "Output of Pow operator"); + AddAttr("factor", "The exponential factor of Pow").SetDefault(1.0f); + AddComment(R"DOC( +Pow Activation Operator. + +$out = x^{factor}$ + +)DOC"); + } +}; + +class STanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of STanh operator"); + AddOutput("Out", "Output of STanh operator"); + AddAttr("scale_a", "The scale parameter of a for the input") + .SetDefault(2.0f / 3.0f); + AddAttr("scale_b", "The scale parameter of b for the input") + .SetDefault(1.7159f); + AddComment(R"DOC( +STanh Activation Operator. 
+ +$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ + +)DOC"); + } +}; + +class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of ThresholdedRelu operator"); + AddOutput("Out", "Output of ThresholdedRelu operator"); + AddAttr("threshold", "The threshold location of activation") + .SetDefault(1.0f); + AddComment(R"DOC( +ThresholdedRelu Activation Operator. + +$$ +out = \begin{cases} + x, \text{if } x > threshold \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of HardSigmoid operator"); + AddOutput("Out", "Output of HardSigmoid operator"); + AddAttr("slope", "Slope for linear approximation of sigmoid") + .SetDefault(0.2f); + AddAttr("offset", "Offset for linear approximation of sigmoid") + .SetDefault(0.5f); + AddComment(R"DOC( +HardSigmoid Activation Operator. + +Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), +which is much faster than sigmoid. + +$out = \max(0, \min(1, slope * x + shift))$ + +The slope should be positive. The offset can be either positive or negative. +The default slope and shift are set according to the above reference. +It is recommended to use the defaults for this activation. 
+ +)DOC"); + } +}; + +class SwishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Swish operator"); + AddOutput("Out", "Output of Swish operator"); + AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); + AddComment(R"DOC( +Swish Activation Operator. + +$$out = \frac{x}{1 + e^{- \beta x}}$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad, + ops::ActivationOpGrad); + +REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, + logsigmoid_grad, ops::ActivationOpGrad); + +REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, + ops::ActivationOpGrad); + +REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, + ops::ActivationOpGrad); + +REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, + tanh_shrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, + softshrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, + ops::ActivationOpGrad); + +REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, + ops::ActivationOpGrad); + +REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, + ops::ActivationOpGrad); + +REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad, + ops::ActivationOpGrad); + +REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad, + ops::ActivationOpGrad); + +REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, + reciprocal_grad, ops::ActivationOpGrad); + +REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, + ops::ActivationOpGrad); + 
+REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, + ops::ActivationOpGrad); + +REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad, + ops::ActivationOpGrad); + +REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad, + ops::ActivationOpGrad); + +REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, + leaky_relu_grad, ops::ActivationOpGrad); + +REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad, + ops::ActivationOpGrad); + +REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, + ops::ActivationOpGrad); + +REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, + ops::ActivationOpGrad); + +REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker, + hard_shrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker, + thresholded_relu_grad, ops::ActivationOpGrad); + +REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, + hard_sigmoid_grad, ops::ActivationOpGrad); + +REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, + ops::ActivationOpGrad); + +#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL( \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu new file mode 100644 index 
0000000000000000000000000000000000000000..b86a7926a978b987d9dbb51fba55e025aab5e7fd --- /dev/null +++ b/paddle/fluid/operators/activation_op.cu @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/activation_op.h" + +namespace ops = paddle::operators; + +#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7a6ae2224c84cc17223f43046f06f08d11451439 --- /dev/null +++ b/paddle/fluid/operators/activation_op.h @@ -0,0 +1,799 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class ActivationKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + Out.mutable_data(context.GetPlace()); + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(*place, x, out); + } +}; + +template +class ActivationGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Input("Out"); + auto* dOut = + context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto dout = framework::EigenVector::Flatten(*dOut); + auto x = framework::EigenVector::Flatten(*X); + auto out = 
framework::EigenVector::Flatten(*Out); + auto dx = framework::EigenVector::Flatten(*dX); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(*place, x, out, dout, dx); + } +}; + +template +struct BaseActivationFunctor { + using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } +}; + +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0))) - log( exp(-max(-x, 0)) + exp(-x - max(-x, 0))) +// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) +// For numerical stability: f' = 
exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + } +}; + +// exp(x) = e^x +template +struct ExpFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.exp(); + } +}; + +template +struct ExpGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out; + } +}; + +// relu(x) = max(x, 0) +template +struct ReluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)); + } +}; + +template +struct ReluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast(); + } +}; + +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template +struct TanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * out); + } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhShrinkFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); + } +}; + +template +struct TanhShrinkGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) 
const { + dx.device(d) = dout * (x.tanh() * x.tanh()); + } +}; + +// hard_shrink(x) = x, if x > threshold or x < -threshold; 0 otherwise +template +struct HardShrinkFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out) const { + auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); + auto temp2 = (x > static_cast(threshold)).template cast().eval(); + out.device(d) = x * (temp1 + temp2); + } +}; + +template +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); + auto temp2 = (x > static_cast(threshold)).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 +// otherwise +template +struct SoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast().eval(); + auto temp2 = (x < -lambdaT).template cast().eval(); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); + } +}; + +template +struct SoftShrinkGradFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast().eval(); + auto temp2 = (x < 
-lambdaT).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +// sqrt(x) = x^(1/2) +template +struct SqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.sqrt(); + } +}; + +template +struct SqrtGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + const Out out_conj = Eigen::numext::conj(out); + dx.device(d) = static_cast(0.5) * dout / out_conj; + } +}; + +// ceil(x) = ceiling(x) +template +struct CeilFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.ceil(); + } +}; + +// Gradient functor that writes zeros into dx; registered for ceil, floor and +// round. The zero is produced as 0 * out rather than 0 / x, because 0 / x +// evaluates to NaN wherever x == 0, while 0 * out is 0 for every finite out. +template +struct ZeroGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(0) * out; + } +}; + +// floor(x) = flooring(x) +template +struct FloorFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.floor(); + } +}; + +// round(x) = [x] +template +struct RoundFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.round(); + } +}; + +// abs(x) = |x| +template +struct AbsFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.abs(); + } +}; + +template +struct AbsGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.sign(); + } +}; + +// reciprocal(x) = 1 / x +template +struct ReciprocalFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / x; + } +}; + +template +struct ReciprocalGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut 
dout, dX dx) const { + dx.device(d) = dout * static_cast(-1) * out * out; + } +}; + +// log(x) = natural logarithm of x +template +struct LogFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.log(); + } +}; + +template +struct LogGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) / x); + } +}; + +// square(x) = x^2 +template +struct SquareFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.square(); + } +}; + +template +struct SquareGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(2) * x; + } +}; + +template +struct BReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` + // not polymorphism for speed. 
+ typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); + } +}; + +template +struct BReluGradFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); + } +}; + +// relu6(x) = min(max(0, x), 6) +template +struct Relu6Functor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(0)).cwiseMin(static_cast(threshold)); + } +}; + +template +struct Relu6GradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(0)) * (x < static_cast(threshold))) + .template cast(); + } +}; + +// softplus(x) = log(1 + exp(x)) +// When x is a very large positive number, exp(x) may explode to inf, +// Using trick below for numerical stability +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0))) +template +struct SoftplusFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); + } +}; + +// d(softplus(x))/dx = exp(x) 
/ (1 + exp(x)) +// For numerical stability: +// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) + +// exp(x - max(x, 0))) +template +struct SoftplusGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + dx.device(d) = + dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); + } +}; + +// softsign(x) = x / (1 + |x|) +template +struct SoftsignFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x / (static_cast(1) + x.abs()); + } +}; + +// d(softsign(x))/dx = 1 / (1 + |x|)^2 +// Taken from https://en.wikipedia.org/wiki/Activation_function +template +struct SoftsignGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); + } +}; + +// soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold))) +template +struct SoftReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto tmp = static_cast(threshold); + auto temp = x.cwiseMax(-tmp).cwiseMin(tmp); + out.device(d) = (static_cast(1) + temp.exp()).log(); + } +}; + +// d(soft_relu(x))/dx = 1 - exp(-out) for -threshold < x < threshold, else 0 +template +struct SoftReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto tmp = static_cast(threshold); + auto temp = ((x > -tmp) * (x < tmp)).template cast().eval(); + dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; + } +}; + +template +struct LeakyReluFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + 
template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } +}; + +template +struct LeakyReluGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(alpha) * + (x < static_cast(0)).template cast().eval(); + auto temp2 = (x >= static_cast(0)).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +template +struct ELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)) + + (static_cast(alpha) * (x.exp() - static_cast(1))) + .cwiseMin(static_cast(0)); + } +}; + +template +struct ELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast() + + dout * (out + static_cast(alpha)) * + (x < static_cast(0)).template cast(); + } +}; + +// FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 +template +struct PowFunctor : public BaseActivationFunctor { + float factor; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"factor", &factor}}; + } + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.pow(static_cast(factor)); + } +}; + +template +struct PowGradFunctor : public BaseActivationFunctor { + float factor; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"factor", &factor}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * 
static_cast(factor) * + x.pow(static_cast(factor - static_cast(1))); + } +}; + +template +struct STanhFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + static_cast(scale_b) * (static_cast(scale_a) * x).tanh(); + } +}; + +template +struct STanhGradFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto a = static_cast(scale_a); + auto b = static_cast(scale_b); + auto temp = (a * x).tanh() * (a * x).tanh(); + dx.device(d) = dout * a * b * (static_cast(1) - temp); + } +}; + +template +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto th = static_cast(threshold); + out.device(d) = (x > th).template cast() * x; + } +}; + +template +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto th = static_cast(threshold); + dx.device(d) = dout * (x > th).template cast(); + } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + 
temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } +}; + +template +struct SwishFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x / (static_cast(1) + (static_cast(-beta) * x).exp()); + } +}; + +template +struct SwishGradFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) / + (static_cast(1) + (static_cast(-beta) * x).exp()); + auto temp2 = temp1 * (static_cast(1) - (beta * out)); + dx.device(d) = dout * ((beta * out) + temp2); + } +}; + +} // namespace operators +} // namespace paddle + +#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ + __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ + __macro(exp, ExpFunctor, ExpGradFunctor); \ + __macro(relu, ReluFunctor, ReluGradFunctor); \ + __macro(tanh, TanhFunctor, TanhGradFunctor); \ + __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ + __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ + __macro(abs, AbsFunctor, AbsGradFunctor); \ + __macro(ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + 
__macro(log, LogFunctor, LogGradFunctor); \ + __macro(square, SquareFunctor, SquareGradFunctor); \ + __macro(brelu, BReluFunctor, BReluGradFunctor); \ + __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(pow, PowFunctor, PowGradFunctor); \ + __macro(stanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ + __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ + __macro(elu, ELUFunctor, ELUGradFunctor); \ + __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \ + __macro(swish, SwishFunctor, SwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ececd47e6a6787a161405fec75dafda336fddfbf --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/adadelta_op.h" + +namespace paddle { +namespace operators { + +class AdadeltaOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"), + "Input(AvgSquaredGrad) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), + "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdadeltaOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AvgSquaredGradOut"), + "Output(AvgSquaredGradOut) of AdadeltaOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AvgSquaredUpdateOut"), + "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "param and grad input of AdadeltaOp should have same dimension"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), + "Param and AvgSquaredGrad input of AdadeltaOp " + "should have same dimension"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), + "Param and AvgSquaredUpdate input of AdadeltaOp " + "should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("AvgSquaredGradOut", param_dim); + ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); + } +}; + +class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + 
AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient"); + AddInput("AvgSquaredUpdate", + "(Tensor) Input average of squared parameter updates"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("AvgSquaredGradOut", + "(Tensor) Output average of squared gradient"); + AddOutput("AvgSquaredUpdateOut", + "(Tensor) Output average of squared parameter updates"); + + AddAttr("rho", + "(float, default 0.95) Exponential decay rate " + "for squared gradients.") + .SetDefault(0.95f); + AddAttr("epsilon", + "(float, default 1.0e-6) Constant for " + "numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( +Adadelta Optimizer. + +Adadelta optimizer is implemented as explained in: +https://arxiv.org/abs/1212.5701 +Adadelta is a per-dimension adaptive learning rate method used +for gradient descent. + +Adadelta updates are as follows: + +$$ +avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\ +param\_update = - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\ +avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\ +param\_out = param + param\_update +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); +REGISTER_OP_CPU_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/fluid/operators/adadelta_op.cu b/paddle/fluid/operators/adadelta_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..733482f788df8dfe1224ebe0d4494111bf9f647b --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adadelta_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/fluid/operators/adadelta_op.h b/paddle/fluid/operators/adadelta_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82ced08710448293fb91a4fb0dea7ab216cd3da6 --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class AdadeltaOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto avg_squared_grad_out_tensor = + ctx.Output("AvgSquaredGradOut"); + auto avg_squared_update_out_tensor = + ctx.Output("AvgSquaredUpdateOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); + avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); + + T rho = static_cast(ctx.Attr("rho")); + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + // Squared gradient accumulator + auto avg_squared_grad = framework::EigenVector::Flatten( + *ctx.Input("AvgSquaredGrad")); + // Squared updates accumulator + auto avg_squared_update = framework::EigenVector::Flatten( + *ctx.Input("AvgSquaredUpdate")); + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto avg_squared_grad_out = + framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); + auto avg_squared_update_out = + framework::EigenVector::Flatten(*avg_squared_update_out_tensor); + auto& place = *ctx.template device_context().eigen_device(); + + avg_squared_grad_out.device(place) = + rho * avg_squared_grad + (1 - rho) * grad.square(); + auto update = + -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) + .sqrt() * + grad; + avg_squared_update_out.device(place) = + rho * avg_squared_update + (1 - rho) * update.square(); + param_out.device(place) = param + update; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc new file 
mode 100644 index 0000000000000000000000000000000000000000..61c0ecd019b1d7811ee5cfd4b43358bdb0fba3d9 --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/adagrad_op.h" + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +class AdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of AdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + 
PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdagradOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of AdagradOp should have the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( + +Adaptive Gradient Algorithm (Adagrad). + +The update is done as follows: + +$$moment\_out = moment + grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} +$$ + +The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) +does not have the epsilon attribute. It is added here in our implementation +as also proposed here: http://cs231n.github.io/neural-networks-3/#ada +for numerical stability to avoid the division by zero error. + +)DOC"); + } +}; + +namespace { +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. 
g_m.rows = set(g.rows) + auto grad_width = grad.value().dims()[1]; + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto& merge_rows = grad_merge.rows(); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + + // 2. m += g_m * g_m + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); + + math::SelectedRowsAddToTensor functor; + functor(context, grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + for (size_t i = 0; i < merge_rows.size(); i++) { + for (int64_t j = 0; j < grad_width; j++) { + param_data[merge_rows[i] * grad_width + j] -= + lr[0] * grad_merge_data[i * grad_width + j] / + (std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon); + } + } + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/fluid/operators/adagrad_op.cu b/paddle/fluid/operators/adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1117363c133fe02c0c6b0a563d0b3665efb7fb18 --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.cu @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adagrad_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows, + T* grad_merge, const int64_t* grad_merge_rows, + size_t grad_merge_rows_size, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t grad_merge_idx; + + if (tid == 0) { + for (size_t i = 0; i < grad_merge_rows_size; i++) { + if (grad_rows[ty] == grad_merge_rows[i]) { + grad_merge_idx = i; + } + } + } + + __syncthreads(); + + grad += ty * row_numel; + grad_merge += grad_merge_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]); + } +} + +template +__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, + const T* learning_rate, T* param, + T* moment, int64_t row_numel, + T epsilon) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + grad += ty * row_numel; + param += rows[ty] * row_numel; + moment += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. 
+ paddle::platform::CudaAtomicAdd(param + index, + -1.0 * learning_rate[0] * grad[index] / + (sqrt(moment[index]) + epsilon)); + } +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_width = grad.value().dims()[1]; + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + framework::Vector merge_rows(grad_merge.rows()); + // 2. m += g_m * g_m + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); + + math::SelectedRowsAddToTensor functor; + functor(context, grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid2(1, merge_rows.size()); + SparseAdagradFunctorKernel< + T, 256><<(context) + .stream()>>>( + grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, + param_data, moment_data, grad_width, epsilon); + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/fluid/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ee503b2c36299c7550f6679fe6e4bca7c33c8eee --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +struct SparseAdagradFunctor { + void operator()(const DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param); +}; + +template +class AdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out_tensor = ctx.Output("ParamOut"); + auto* moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto* grad_var = ctx.InputVar("Grad"); + if (grad_var->IsType()) { + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto* learning_rate = ctx.Input("LearningRate"); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto* place = ctx.template device_context().eigen_device(); + + moment_out.device(*place) = moment + grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + if 
(platform::is_cpu_place(ctx.GetPlace())) { + auto* lr = learning_rate->data(); + param_out.device(*place) = + param - lr[0] * grad / (moment_out.sqrt() + epsilon); + } else { + auto lr = framework::EigenVector::Flatten(*learning_rate); + param_out.device(*place) = + param - + lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } + } else if (grad_var->IsType()) { + auto* param_tensor = ctx.Input("Param"); + PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); + + auto* moment_tensor = ctx.Input("Moment"); + PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); + + SparseAdagradFunctor functor; + functor(ctx.template device_context(), + *ctx.Input("Grad"), + *ctx.Input("LearningRate"), epsilon, + moment_out_tensor, param_out_tensor); + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..25da9336b28ca8d14bf01ce8ca13bd8b379e9b10 --- /dev/null +++ b/paddle/fluid/operators/adam_op.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/adam_op.h" + +namespace paddle { +namespace operators { + +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment1"), + "Input(Moment1) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment2"), + "Input(Moment2) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + "Input(Beta2Pow) of AdamOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + "Output(Moment1Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + "Output(Moment2Out) of AdamOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + "Beta2 power accumulator should have 1 dimension"); + + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + "Param and 
Moment1 input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment2"), + "Param and Moment2 input of AdamOp should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("Moment1Out", param_dims); + ctx->SetOutputDim("Moment2Out", param_dims); + } +}; + +class AdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment1", "(Tensor) Input first moment"); + AddInput("Moment2", "(Tensor) Input second moment"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("Moment1Out", "(Tensor) Output first moment"); + AddOutput("Moment2Out", "(Tensor) Output second moment"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + + AddComment(R"DOC( +Adam Optimizer. + +This implements the Adam optimizer from Section 2 of the Adam +paper : https://arxiv.org/abs/1412.6980. +Adam is a first-order gradient-based optimization method based on +adaptive estimates of lower-order moments. 
+ +Adam updates: + +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); +REGISTER_OP_CPU_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/fluid/operators/adam_op.cu b/paddle/fluid/operators/adam_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..85b806eb6a1c0ed28cd47331786b66fc2e3a21eb --- /dev/null +++ b/paddle/fluid/operators/adam_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adam_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a51b46ef15778cf83d4d4f9c2d8f366b1c5d6b9f --- /dev/null +++ b/paddle/fluid/operators/adam_op.h @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include // for sqrt in CPU and CUDA +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +namespace scatter = paddle::operators::math::scatter; + +template +struct AdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + inline HOSTDEVICE void operator()(size_t i) const { + // Merge all memory access together. 
+ T g = grad_[i]; + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } +}; + +template +struct SparseAdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + + SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, + const T* mom2, T* mom2_out, const T* lr, const T* grad, + const T* param, T* param_out, const int64_t* rows, + int64_t row_numel) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel) {} + + inline HOSTDEVICE void operator()(size_t i) const { + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + for (int64_t j = 0; j < row_numel_; ++j) { + T g = grad_[i * row_numel_ + j]; + T mom1 = moment1_[rows_[i] * row_numel_ + j]; + T mom2 = moment2_[rows_[i] * row_numel_ + j]; + T lr = *lr_; + T p = param_[rows_[i] * row_numel_ + j]; + + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + moment1_out_[rows_[i] * row_numel_ + j] = mom1; + moment2_out_[rows_[i] * row_numel_ + j] = mom2; + 
param_out_[rows_[i] * row_numel_ + j] = p; + } // for col id + } +}; + +template +class AdamOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using paddle::framework::LoDTensor; + using paddle::operators::detail::Ref; + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + auto& param = Ref(ctx.Input("Param"), "Must set Param"); + // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto* grad_var = ctx.InputVar("Grad"); + auto& mom1 = Ref(ctx.Input("Moment1"), "Must set Moment1"); + auto& mom2 = Ref(ctx.Input("Moment2"), "Must set Moment2"); + auto& lr = + Ref(ctx.Input("LearningRate"), "Must set LearningRate"); + + auto& beta1_pow = + Ref(ctx.Input("Beta1Pow"), "Must set Beta1Pow"); + auto& beta2_pow = + Ref(ctx.Input("Beta2Pow"), "Must set Beta2Pow"); + + auto& param_out = + Ref(ctx.Output("ParamOut"), "Must set ParamOut"); + auto& mom1_out = + Ref(ctx.Output("Moment1Out"), "Must set Moment1Out"); + auto& mom2_out = + Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); + + if (grad_var->IsType()) { + auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } else if (grad_var->IsType()) { + auto& grad = + Ref(ctx.Input("Grad"), "Must set Grad"); + // merge duplicated rows if any. 
+ scatter::MergeAdd merge_func; + auto grad_merge = + merge_func(ctx.template device_context(), grad); + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + int64_t* rows = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace()); + } else { + rows = grad_merge.mutable_rows()->data(); + } + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad_data, param.template data(), + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel); + platform::ForRange for_range( + static_cast(ctx.device_context()), + grad_merge.rows().size()); + for_range(functor); + } else { + PADDLE_THROW("Variable type not supported by adam_op"); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2249b8f96da86438748ab5b2b0f748cc590b8f7 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/adamax_op.h" + +namespace paddle { +namespace operators { + +class AdamaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InfNorm"), + "Input(InfNorm) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamaxOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"), + "Output(InfNormOut) of AdamaxOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamaxOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of AdamaxOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("InfNorm"), + "Param and InfNorm input of AdamaxOp should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dims); + 
ctx->SetOutputDim("MomentOut", param_dims); + ctx->SetOutputDim("InfNormOut", param_dims); + } +}; + +class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment", "(Tensor) First moment"); + AddInput("InfNorm", + "(Tensor) " + "Input exponentially weighted infinity norm"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output first moment"); + AddOutput("InfNormOut", + "(Tensor) " + "Output exponentially weighted infinity norm"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "1st moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the weighted " + "infinity norm estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + AddComment(R"DOC( +Adamax Optimizer. + +We implement the Adamax optimizer from Section 7 of the Adam +paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the +Adam algorithm based on the infinity norm. + +Adamax updates: + +$$ +moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\ +inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\ +learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out} +$$ + +The original paper does not have an epsilon attribute. +However, it is added here for numerical stability to prevent the +division by 0 error. 
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); +REGISTER_OP_CPU_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/fluid/operators/adamax_op.cu b/paddle/fluid/operators/adamax_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..44a5d6c7bdeac94ceb710d981c6445c046528cb0 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adamax_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/fluid/operators/adamax_op.h b/paddle/fluid/operators/adamax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..124453c0eceb4caa53bf63a8d9e8c4b90a2213c9 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class AdamaxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + auto inf_norm_out_tensor = ctx.Output("InfNormOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + inf_norm_out_tensor->mutable_data(ctx.GetPlace()); + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto inf_norm = framework::EigenVector::Flatten( + *ctx.Input("InfNorm")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + auto beta1_pow = framework::EigenVector::Flatten( + *ctx.Input("Beta1Pow")); + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto inf_norm_out = + framework::EigenVector::Flatten(*inf_norm_out_tensor); + auto* place = ctx.template device_context().eigen_device(); + + moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(*place) = + 
grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); + auto lr_t = lr / (1 - beta1_pow); + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(*place) = + param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..4ffb414ecea350006e5a370a0b25ae304cace89c --- /dev/null +++ b/paddle/fluid/operators/array_operator.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +class ArrayOp : public framework::OperatorBase { + public: + ArrayOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + size_t GetOffset(const framework::Scope &scope, + const platform::Place &place) const { + auto *i = scope.FindVar(Input("I")); + PADDLE_ENFORCE(i != nullptr, "I must be set"); + auto &i_tensor = i->Get(); + PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + size_t offset; + if (platform::is_gpu_place(i_tensor.place())) { + // FIXME: Avoid copy from GPU to CPU + framework::Tensor t; + framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t); + dev_ctx.Wait(); + offset = static_cast(*t.data()); + } else { + offset = static_cast(*i_tensor.data()); + } + VLOG(10) << " Offset = " << offset; + return offset; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf8e11bd8c047275fe341ead9424d02e98d5d8f4 --- /dev/null +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +using LoD = framework::LoD; + +class ArrayToLoDTensorOp : public framework::OperatorBase { + public: + ArrayToLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &rank_table = + scope.FindVar(Input("RankTable"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + + // Check dims, place and data type of input's elements and infer output's + // dim + PADDLE_ENFORCE(!x.empty(), "There's no element in the input array."); + int rank = x[0].dims().size(); + platform::Place place = x[0].place(); + std::type_index data_type = x[0].type(); + framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank); + int64_t batch_size = x[0].dims()[0]; + for (size_t i = 1; i < x.size(); ++i) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims, + "The dimension of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place), + "The place class of 
the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + PADDLE_ENFORCE(x[i].type() == data_type, + "The date type of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + batch_size += x[i].dims()[0]; + } + auto ins_dim_vec = framework::vectorize(ins_dims); + ins_dim_vec.insert(ins_dim_vec.begin(), batch_size); + framework::DDim out_dims = framework::make_ddim(ins_dim_vec); + out->Resize(out_dims); + out->mutable_data(place, data_type); + + auto &table_items = rank_table.items(); + std::vector table_item_idx(table_items.size()); + // table_item_idx = range(table_items_idx.size()) + std::iota(table_item_idx.begin(), table_item_idx.end(), 0); + std::sort(table_item_idx.begin(), table_item_idx.end(), + [&](size_t a, size_t b) { + return table_items[a].index < table_items[b].index; + }); + + // Build LoDTensor `out` + framework::LoD *out_lod = out->mutable_lod(); + out_lod->clear(); + size_t out_offset = 0; + auto prefix_lod = rank_table.coarse_lod(); + prefix_lod.emplace_back(); + auto &cur_level_lod = prefix_lod.back(); + cur_level_lod.push_back(0); + for (size_t idx : table_item_idx) { + cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length); + for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) { + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x[x_idx].lod(), idx, idx + 1, 0); + + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(out_lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " [" + << ", " << end_offset << "]"; + // Copy data + PADDLE_ENFORCE_GE(end_offset, start_offset); + size_t len = end_offset - start_offset; + if (len == 0) { + continue; + } + auto slice = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + 
framework::Copy(x[x_idx].Slice(start_offset, end_offset), place, + dev_ctx, &slice); + out_offset += len; + } + } + out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end()); + } +}; + +class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(std::vector) A vector of tensors that is going to " + "be casted to a big LoDTensor."); + AddInput("RankTable", + "(LoDRankTable) RankTable provides the coarse lod infomation to " + "build the output LoDTensor. See " + "'paddle/framework/lod_rank_table.h' for more details."); + AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array."); + AddComment( + R"DOC(This Op build a big LoDTensor from a std::vector + and a LoDRankTable. It is supposed to be used in getting dynamic RNN's + outputs back to a normal LoDTensor. The std::vector + would be the output of RNN Op and the LoDRankTable would be build + with RNN's input.)DOC"); + } +}; + +class ArrayToLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "ArrayToLoDTensorOp must has input X."); + PADDLE_ENFORCE(context->HasInput("RankTable"), + "ArrayToLoDTensorOp must has input RankTable."); + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("lod_tensor_to_array"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("RankTable", Input("RankTable")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return 
std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp, + ops::ArrayToLoDTensorOpProtoMaker, + ops::ArrayToLoDTensorInferShape, + ops::ArrayToLoDTensorGradMaker); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f99f9af4276c0e8928f821ae166d55aed02e8e27 --- /dev/null +++ b/paddle/fluid/operators/assign_op.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +class AssignFunctor { + public: + AssignFunctor(framework::Variable *out, + const platform::DeviceContext &dev_ctx) + : out_(out), dev_ctx_(dev_ctx) {} + + void operator()(const framework::LoDTensor &lod_tensor) const { + auto &out_tensor = *out_->GetMutable(); + copy_tensor(lod_tensor, &out_tensor); + } + + void operator()(const framework::LoDTensorArray &array) const { + auto &out_array = *out_->GetMutable(); + out_array.resize(array.size()); + for (size_t i = 0; i < array.size(); ++i) { + copy_tensor(array[i], &out_array[i]); + } + } + + void operator()(const framework::SelectedRows &rows) const { + framework::SelectedRows &out_rows = + *out_->GetMutable(); + out_rows.set_rows(rows.rows()); + out_rows.set_height(rows.height()); + auto &t = rows.value(); + auto *m = out_rows.mutable_value(); + framework::Copy(t, t.place(), dev_ctx_, m); + } + + template + void operator()(const T &v) const { + PADDLE_THROW("Not support type for assign op %s", typeid(T).name()); + } + + private: + void copy_tensor(const framework::LoDTensor &lod_tensor, + framework::LoDTensor *out) const { + auto &out_tensor = *out; + Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } + + framework::Variable *out_; + const platform::DeviceContext &dev_ctx_; +}; + +class AssignOp : public framework::OperatorBase { + public: + AssignOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + if (x == nullptr) { + return; + } + auto *out 
= scope.FindVar(Output("Out")); + PADDLE_ENFORCE( + out != nullptr, + "The Output(Out) should not be null if the Input(X) is set."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); + } +}; + +class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, SelectedRows or LoDTensorArray) The input variable " + "could be LoDTensor, SelectedRows or LoDTensorArray.") + .AsDispensable(); + AddOutput("Out", + "(LoDTensor, SelectedRows or LoDTensorArray) The type of output " + "is the same as input X."); + AddComment(R"DOC(Assign Operator + +Out = X, when type in [LoDTensor/SelectedRows/LoDTensorArray] +raise error if the type is not listed above. +)DOC"); + } +}; + +class AssignInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + if (context->HasInput("X")) { + auto type = context->GetInputsVarType("X")[0]; + if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS || + type == framework::proto::VarDesc_VarType_LOD_TENSOR) { + context->SetOutputDim("Out", context->GetInputDim("X")); + } + } + } +}; + +class AssignGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("assign"); + op->SetInput("X", OutputGrad("Out")); + op->SetOutput("Out", InputGrad("X")); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, + ops::AssignInferShape, ops::AssignOpProtoMaker); diff --git 
a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..835043d9ab49a20a73d5dd0fff936cb3e9473b1e --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace paddle { +namespace operators { + +class AssignValueOp : public framework::OperatorWithKernel { + public: + AssignValueOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of AssignValueOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::proto::DataType(ctx.Attr("dtype")), ctx.GetPlace()); + } +}; + +class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, 
op_checker) { + AddOutput("Out", "(Tensor) Output tensor of assign_value operator."); + AddAttr>("shape", + "(vector) " + "Shape of values."); + AddAttr("dtype", "data type of values") + .InEnum({framework::proto::DataType::INT32, + framework::proto::DataType::FP32}); + AddAttr>("fp32_values", "store the float values") + .SetDefault({}); + AddAttr>("int32_values", "store the int values") + .SetDefault({}); + AddComment(R"DOC( +AssignValue operator + +$$Out = values$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker); +REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..616163f97b9b917187ff66339c01f95289f2f618 --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.cu.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h new file mode 100644 index 0000000000000000000000000000000000000000..33a344cad596a079faf2582ee1d9dc497531465a --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class AssignValueKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto shape = ctx.Attr>("shape"); + auto* out = ctx.Output("Out"); + int dtype = ctx.Attr("dtype"); + const char* value_name = nullptr; + switch (dtype) { + case framework::proto::DataType::INT32: + value_name = "int32_values"; + break; + case framework::proto::DataType::FP32: + value_name = "fp32_values"; + break; + default: + PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); + break; + } + auto values = ctx.Attr>(value_name); + framework::CopyFromVector(values, ctx.device_context(), out); + out->Resize(framework::make_ddim(shape)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ac08ea4a19b981b0dc8dac43e4ae5de7b09bb5d --- /dev/null +++ b/paddle/fluid/operators/auc_op.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/auc_op.h" + +namespace paddle { +namespace operators { + +class AucOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input of Indices should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input of Label should not be null."); + auto inference_height = ctx->GetInputDim("Out")[0]; + auto label_height = ctx->GetInputDim("Label")[0]; + + PADDLE_ENFORCE_EQ(inference_height, label_height, + "Out and Label should have same height."); + + ctx->SetOutputDim("AUC", {1}); + ctx->ShareLoD("Out", /*->*/ "AUC"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Out")->type()), + ctx.device_context()); + } +}; + +class AucOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AucOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Out", + "A floating point 2D tensor, values are in the range [0, 1]." + "Each row is sorted in descending order. This input should be the" + "output of topk." + "Typically, this tensor indicates the probability of each label"); + AddInput("Indices", + "An int 2D tensor, indicating the indices of original" + "tensor before sorting. Typically, this tensor indicates which " + "label the probability stands for."); + AddInput("Label", + "A 2D int tensor indicating the label of the training data." 
+ "The height is batch size and width is always 1."); + // TODO(typhoonzero): support weight input + AddOutput("AUC", + "A scalar representing the " + "current area-under-the-curve."); + + AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") + .SetDefault("ROC"); + AddAttr("num_thresholds", + "The number of thresholds to use when discretizing the" + " roc curve.") + .SetDefault(200); + + AddComment(R"DOC( +Area Under The Curve (AUC) Operator. + +This implementation computes the AUC according to forward output and label. +It is used very widely in binary classification evaluation. As a note: +If input label contains values other than 0 and 1, it will be cast +to bool. You can find the relevant definitions here: +https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + +There are two types of possible curves: +1. ROC: Receiver operating characteristic +2. PR: Precision Recall +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); +REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel); diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e648db70974087f84020f45c568fb0c1924a88dd --- /dev/null +++ b/paddle/fluid/operators/auc_op.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenVector = framework::EigenVector; + +template +class AucKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* label = ctx.Input("Label"); + auto* auc = ctx.Output("AUC"); + + float* auc_data = auc->mutable_data(ctx.GetPlace()); + + std::string curve = ctx.Attr("curve"); + int num_thresholds = ctx.Attr("num_thresholds"); + std::vector thresholds_list; + thresholds_list.reserve(num_thresholds); + for (int i = 1; i < num_thresholds - 1; i++) { + thresholds_list[i] = (float)i / (num_thresholds - 1); + } + const float kEpsilon = 1e-7; + thresholds_list[0] = 0.0f - kEpsilon; + thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon; + + size_t batch_size = inference->dims()[0]; + size_t inference_width = inference->dims()[1]; + + const T* inference_data = inference->data(); + const int64_t* label_data = label->data(); + + // Create local tensor for storing the curve: TP, FN, TN, FP + // TODO(typhoonzero): use eigen op to caculate these values. 
+ Tensor true_positive, false_positive, true_negative, false_negative; + + true_positive.Resize({num_thresholds}); + false_negative.Resize({num_thresholds}); + true_negative.Resize({num_thresholds}); + false_positive.Resize({num_thresholds}); + + int64_t* tp_data = true_positive.mutable_data(ctx.GetPlace()); + int64_t* fn_data = false_negative.mutable_data(ctx.GetPlace()); + int64_t* tn_data = true_negative.mutable_data(ctx.GetPlace()); + int64_t* fp_data = false_positive.mutable_data(ctx.GetPlace()); + + for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { + // caculate TP, FN, TN, FP for current thresh + int64_t tp = 0, fn = 0, tn = 0, fp = 0; + for (size_t i = 0; i < batch_size; i++) { + // NOTE: label_data used as bool, labels >0 will be treated as true. + if (label_data[i]) { + // use first(max) data in each row + if (inference_data[i * inference_width] >= + (thresholds_list[idx_thresh])) { + tp++; + } else { + fn++; + } + } else { + if (inference_data[i * inference_width] >= + (thresholds_list[idx_thresh])) { + fp++; + } else { + tn++; + } + } + } + // store rates + tp_data[idx_thresh] = tp; + fn_data[idx_thresh] = fn; + tn_data[idx_thresh] = tn; + fp_data[idx_thresh] = fp; + } + // epsilon to avoid divide by zero. + float epsilon = 1e-6; + // Riemann sum to caculate auc. 
+ Tensor tp_rate, fp_rate, rec_rate; + tp_rate.Resize({num_thresholds}); + fp_rate.Resize({num_thresholds}); + rec_rate.Resize({num_thresholds}); + float* tp_rate_data = tp_rate.mutable_data(ctx.GetPlace()); + float* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); + float* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); + for (int i = 0; i < num_thresholds; i++) { + tp_rate_data[i] = + ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon); + fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon); + rec_rate_data[i] = + ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon); + } + *auc_data = 0.0f; + if (curve == "ROC") { + for (int i = 0; i < num_thresholds - 1; i++) { + auto dx = fp_rate_data[i] - fp_rate_data[i + 1]; + auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f; + *auc_data = *auc_data + dx * y; + } + } else if (curve == "PR") { + for (int i = 1; i < num_thresholds; i++) { + auto dx = tp_rate_data[i] - tp_rate_data[i - 1]; + auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f; + *auc_data = *auc_data + dx * y; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..506c25d50d453ef841e6885c412ccff38f25cebb --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -0,0 +1,448 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/framework/data_layout.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class BatchNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); + PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); + PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); + PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); + PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), ""); + PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), ""); + PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); + PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); + + // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], + "Mean and MeanOut should share the same memory"); + PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], + ctx->Outputs("VarianceOut")[0], + "Variance and VarianceOut should share the same memory"); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C); + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); + } +}; + +class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("is_test", "").SetDefault(false); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Mean", + "The global mean (for training) or " + "estimated mean (for testing)"); + AddInput("Variance", + "The global variance (for training) " + "or estimated Variance (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. 
" + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddComment(R"DOC( +Batch Normalization. + +Batch Norm has been implemented as discussed in the paper: +https://arxiv.org/pdf/1502.03167.pdf +Can be used as a normalizer function for conv2d and fully_connected operations. +The required data format for this layer is one of the following: +1. NHWC `[batch, in_height, in_width, in_channels]` +2. NCHW `[batch, in_channels, in_height, in_width]` + +)DOC"); + } +}; + +template +class BatchNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x->numel() / N / C; + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + if (!is_test) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e( + saved_mean->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap saved_variance_e( + saved_variance->mutable_data(ctx.GetPlace()), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); + for (int i = 0; i < N * sample_size; ++i) { + saved_mean_e += x_arr.col(i); + } + saved_mean_e /= N * sample_size; + for (int i = 0; i < N * sample_size; ++i) { + saved_variance_e += + (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); + } + saved_variance_e /= N * sample_size; + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), C); + running_mean_arr = + running_mean_arr * momentum + saved_mean_e * (1. 
- momentum); + running_var_arr = + running_var_arr * momentum + saved_variance_e * (1. - momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (is_test) { + ConstEigenVectorArrayMap var_arr( + ctx.Input("Variance")->data(), C); + inv_std = (var_arr + epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std( + ctx.Output("SavedVariance")->data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + ConstEigenVectorArrayMap mean_arr( + is_test ? ctx.Input("Mean")->data() + : ctx.Output("SavedMean")->data(), + C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap bias_arr(bias->data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (data_layout) { + case DataLayout::kNCHW: { + EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, + N * C); + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + case DataLayout::kNHWC: { + EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, + N * sample_size) = + (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * + new_scale) + .colwise() + + new_bias; + break; + } + default: + PADDLE_THROW("Unknown storage order: %d", data_layout); + } + } +}; + +class BatchNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + 
PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); + PADDLE_ENFORCE(ctx->HasInput("SavedMean"), ""); + PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), ""); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +template +class BatchNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *saved_mean = ctx.Input("SavedMean"); + // SavedVariance have been reverted in forward operator + const auto *saved_inv_variance = ctx.Input("SavedVariance"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + 
framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x->numel() / N / C; + + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap mean_arr(saved_mean->data(), C); + ConstEigenVectorArrayMap inv_var_arr(saved_inv_variance->data(), C); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + // d_bias = np.sum(d_y, axis=0) + // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) + // d_x = (1. 
/ N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) + // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + + EigenVectorArrayMap d_bias_arr(d_bias->mutable_data(ctx.GetPlace()), + C); + EigenVectorArrayMap d_scale_arr(d_scale->mutable_data(ctx.GetPlace()), + C); + + d_bias_arr.setZero(); + d_scale_arr.setZero(); + + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), + sample_size, N * C); + d_x_arr.setZero(); + + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_bias_arr(c) += d_y_arr.col(nc).sum(); + d_scale_arr(c) += + ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) + .sum(); + } + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c)); + } + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, + N * sample_size); + d_x_arr.setZero(); + + const auto d_y_row_sum = d_y_arr.rowwise().sum(); + const auto x_minus_mean = x_arr.colwise() - mean_arr; + const auto d_y_mul_x_minus_mean_row_sum = + (d_y_arr * x_minus_mean).rowwise().sum(); + const auto inv_var_sqr = inv_var_arr * inv_var_arr; + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_bias_arr += d_y_arr.col(nhw); + d_scale_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + d_x_arr.col(nhw) += + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - + x_minus_mean.col(nhw) * inv_var_sqr * + d_y_mul_x_minus_mean_row_sum); + } + 
break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, + batch_norm_grad, ops::BatchNormGradOp); +REGISTER_OP_CPU_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm_grad, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9c97211e14c0ef3a99a7e2b5cbfd8b267d40c1e --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -0,0 +1,278 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/framework/data_layout.h" + +#include +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; + +void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, + int *N, int *C, int *H, int *W, int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? 
dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} + +template +class BatchNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. 
Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + VLOG(1) << "Setting descriptors."; + std::vector dims; + std::vector strides; + if (data_layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * D * C, 1, W * D * C, D * C, C}; + } + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(dev_ctx, saved_mean, 0); + functor(dev_ctx, saved_variance, 0); + + auto handle = dev_ctx.cudnn_handle(); + + // Now, depending on whether we are running test or not, we have two paths. + if (is_test) { + // only when test we use input to do computation. + const auto *est_mean = ctx.Input("Mean"); + const auto *est_var = ctx.Input("Variance"); + // Run inference mode. 
+ PADDLE_ENFORCE_EQ(est_mean->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(est_var->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(est_mean->dims()[0], C); + PADDLE_ENFORCE_EQ(est_var->dims()[0], C); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference( + handle, + // Note: PERSISTENT not implemented for inference + CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, y->template mutable_data(ctx.GetPlace()), + bn_param_desc_, scale->template data(), bias->template data(), + est_mean->template data(), est_var->template data(), epsilon)); + } else { + // Run training mode. + // obtain running mean and running inv var, and see if we need to + // initialize them. + double this_factor = 1. - momentum; + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), + data_desc_, x->template data(), data_desc_, + y->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data(), bias->template data(), this_factor, + mean_out->template mutable_data(ctx.GetPlace()), + variance_out->template mutable_data(ctx.GetPlace()), epsilon, + saved_mean->template mutable_data(ctx.GetPlace()), + saved_variance->template mutable_data(ctx.GetPlace()))); + } + + // clean when exit. 
+ CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +template +class BatchNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(scale->dims()[0], C); + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. 
Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + std::vector dims; + std::vector strides; + if (data_layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const void *saved_mean_data = saved_mean->template data(); + const void *saved_var_data = saved_var->template data(); + + auto &dev_ctx = ctx.template device_context(); + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, + d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data(), + d_scale->template mutable_data(ctx.GetPlace()), + d_bias->template mutable_data(ctx.GetPlace()), epsilon, + saved_mean_data, saved_var_data)); + + // clean when exit. 
+ CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( + batch_norm_grad, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fa9942ad099f4a28a3abc68c676edeeb827aacd0 --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class BatchNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class BatchNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7737d4e098ac9a0e56e1db2aee796550e8d71ba3 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct BeamSearchDecodeFunctor { + BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, LoDTensor* score_tensor) + : step_ids_(step_ids), + step_scores_(step_scores), + id_tensor_(id_tensor), + score_tensor_(score_tensor) {} + + template + void operator()() const; + + const LoDTensorArray& step_ids_; + const LoDTensorArray& step_scores_; + LoDTensor* id_tensor_; + LoDTensor* score_tensor_; +}; + +template +void BeamSearchDecodeFunctor::operator()() const { + BeamSearchDecoder beam_search_decoder; + beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, + score_tensor_); +} + +template <> +void BeamSearchDecodeFunctor::operator()() const { + PADDLE_THROW("beam search decode op does not support bool!"); +} + +class BeamSearchDecodeOp : public framework::OperatorBase { + public: + BeamSearchDecodeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); + + framework::ExecutionContext ctx(*this, scope, dev_ctx); + + const LoDTensorArray* ids = ctx.Input("Ids"); + const LoDTensorArray* scores = ctx.Input("Scores"); + const size_t step_num = ids->size(); + PADDLE_ENFORCE_GT(step_num, 0UL, + "beam search steps should be larger than 0"); + const size_t source_num = ids->at(0).lod().at(0).size() - 1; + PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0"); + + for (size_t i = 0; i < step_num; ++i) { + PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL, + "Level of 
LodTensor should be 2"); + } + + // prepare output + LoDTensor* sentenceIds = ctx.Output("SentenceIds"); + LoDTensor* sentenceScores = ctx.Output("SentenceScores"); + + framework::VisitDataType( + framework::ToDataType(scores->at(0).type()), + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); + } +}; + +class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ids", + "(LodTensorArray)" + "score of the candidate words in each step"); + AddInput("Scores", + "(LodTensorArray)" + "score of the candidate words in each step"); + AddOutput("SentenceIds", + "(LodTensor)" + "All possible result sentences of word ids"); + AddOutput("SentenceScores", + "(LodTensor)" + "All possible result sentences of word scores"); + AddComment(R"DOC( +Pack the result of Beam search op into SentenceIds and SentenceScores. 
+)DOC"); + } +}; + +class BeamSearchDecodeInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("Ids"), + "BeamSearchDecodeOp must has input Ids"); + PADDLE_ENFORCE(context->HasInput("Scores"), + "BeamSearchDecodeOp must has input Scores"); + PADDLE_ENFORCE(context->HasOutput("SentenceIds"), + "BeamSearchDecodeOp must has output SentenceIds"); + PADDLE_ENFORCE(context->HasOutput("SentenceScores"), + "BeamSearchDecodeOp must has output SentenceScores"); + } +}; + +class BeamSearchDecodeInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + for (auto& o : op_desc.Output("SentenceIds")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + for (auto& o : op_desc.Output("SentenceScores")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp, + paddle::operators::BeamSearchDecodeOpProtoMaker, + paddle::operators::BeamSearchDecodeInferShape, + paddle::operators::BeamSearchDecodeInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aeecb8d39acf1e2761aec62b89322c9cbbfe7445 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -0,0 +1,280 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoDTensorArray = framework::LoDTensorArray; + +// all the lod have 2 levels. +// The First is source level, the second is sentence level. +// source level describe how many candidate words for this source. +// sentence level describe these candidates belong to which prefix +const size_t kSourceLevel = 0; +const size_t kSentenceLevel = 1; + +template +struct BeamNode { + BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {} + + ~BeamNode() { + if (parent_) { + parent_->DropKid(this); + if (parent_->kids_.size() == 0UL) { + delete parent_; + } + } + VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_; + } + + void AppendTo(BeamNode* parent) { + parent_ = parent; + parent->kids_.insert(this); + } + + void DropKid(BeamNode* kid) { kids_.erase(kid); } + + BeamNode* parent_ = nullptr; + std::unordered_set kids_; + int64_t word_id_; + T score_; +}; + +template +using BeamNodeVector = std::vector>>; + +template +struct Sentence { + std::vector word_ids; + std::vector scores; +}; + +template +using SentenceVector = std::vector>; + +template +struct BeamSearchDecoder { + /** + * make a BeamNode and all it's related prefix BeanNode into a Sentence. 
+ */ + Sentence MakeSentence(const BeamNode* node) const; + + /** + * Param: + * cur_ids: LoDTensor of One step for word ID + * cur_scores: LoDTensor of One Step for word score + * prefixes_list: prefixes for each source sentence. + * sentence_vector_list: result sentence_vector for each source sentence. + * Return: + * a new prefixes list for each source of current step + */ + std::vector> PackTwoSteps( + const LoDTensor& cur_ids, const LoDTensor& cur_scores, + std::vector>& prefixes_list, + std::vector>* sentence_vector_list) const; + + /** + * convert the result sentence_vector for each source sentence into two + * LodTensor. + * One is all candidate sentences with word id, one is all candidate sentences + * with word score. + * Param: + * sentence_vector_list: sentence_vector for each source sentence. + * id_tensor: result LoDTensor for sentences of id. + * score_tensor: result LoDTensor for sentences of score. + */ + void ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; + + /** + * Pack all steps of id/score LodTensor into sentence LoDTensor + * it's main logic is: + * ```python + * prefix + * result_sentence + * result_lod_tensor + * + * for (step in steps): + * prefix = PackTwoSteps(prefix, step, &result_sentence) + * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor) + * ``` + */ + void PackAllSteps(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; +}; + +template +Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const { + Sentence sentence; + while (node != nullptr) { + sentence.word_ids.emplace_back(node->word_id_); + sentence.scores.emplace_back(node->score_); + node = node->parent_; + } + + std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids)); + std::reverse(std::begin(sentence.scores), std::end(sentence.scores)); + + return sentence; +} + +template 
+std::vector> BeamSearchDecoder::PackTwoSteps( + const LoDTensor& cur_ids, const LoDTensor& cur_scores, + std::vector>& prefixes_list, + std::vector>* sentence_vector_list) const { + std::vector> result; + + for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; + ++src_idx) { + size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + + BeamNodeVector beam_nodes; + + // if prefixes size is 0, it means this is the first step. In this step, + // all candidate id is the start of candidate sentences. + if (prefixes_list.empty()) { + PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), + cur_ids.lod().at(kSentenceLevel).back(), + "in the first step"); + for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) { + beam_nodes.push_back(std::unique_ptr>(new BeamNode( + cur_ids.data()[id_idx], cur_scores.data()[id_idx]))); + } + } else { + BeamNodeVector& prefixes = prefixes_list[src_idx]; + SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx]; + + PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), + "prefix and candidate set number should be the same"); + + auto candidate_offset = cur_ids.lod()[kSentenceLevel]; + for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) { + std::unique_ptr>& prefix = prefixes[prefix_idx]; + size_t candidate_start = candidate_offset[src_start + prefix_idx]; + size_t candidate_end = candidate_offset[src_start + prefix_idx + 1]; + if (candidate_start == candidate_end) { + VLOG(3) << "this sentence has no more candidate, " + "add to result sentence and rm it from beam tree"; + sentence_vector.push_back(MakeSentence(prefix.get())); + prefix.reset(); + } else { + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + auto* candidate = + new BeamNode(cur_ids.data()[candidate_idx], + cur_scores.data()[candidate_idx]); + candidate->AppendTo(prefix.get()); + 
beam_nodes.push_back(std::unique_ptr>(candidate)); + } + prefix.release(); + } + } + } + result.push_back(std::move(beam_nodes)); + } + return result; +} + +template +void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + size_t src_num = sentence_vector_list.size(); + + PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); + + std::vector source_level_lod = {0}; + std::vector sentence_level_lod = {0}; + std::vector id_data; + std::vector score_data; + + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + for (Sentence& sentence : sentence_vector_list[src_idx]) { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + sentence.scores.end()); + sentence_level_lod.push_back(sentence_level_lod.back() + + sentence.word_ids.size()); + } + source_level_lod.push_back(source_level_lod.back() + + sentence_vector_list[src_idx].size()); + } + + auto cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + + framework::LoD lod; + lod.push_back(source_level_lod); + lod.push_back(sentence_level_lod); + + id_tensor->set_lod(lod); + id_tensor->Resize({static_cast(id_data.size())}); + id_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::CopyFromVector(id_data, cpu_ctx, id_tensor); + + score_tensor->set_lod(lod); + score_tensor->Resize({static_cast(score_data.size())}); + score_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::CopyFromVector(score_data, cpu_ctx, score_tensor); +} + +template +void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); + PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), + "step_ids and 
step_scores should be the same"); + const size_t step_num = step_ids.size(); + const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; + + PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); + + // previous prefixes for each step, + // the init length is 0, means this is the first step. + std::vector> beamnode_vector_list(0); + std::vector> sentence_vector_list(src_num); + + // pack all steps for one batch first, then another batch + for (size_t step_id = 0; step_id < step_num; ++step_id) { + beamnode_vector_list = + PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), + beamnode_vector_list, &sentence_vector_list); + } + // append last beam_node to result + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + for (auto& beam_node : beamnode_vector_list.at(src_idx)) { + sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get())); + beam_node.reset(); + } + } + + ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, + score_tensor); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f87279d5eaa19715c31b2228c7a22d4723efae --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -0,0 +1,221 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "gtest/gtest.h" + +using CPUPlace = paddle::platform::CPUPlace; +using LoD = paddle::framework::LoD; +using LoDTensor = paddle::framework::LoDTensor; +using LoDTensorArray = paddle::framework::LoDTensorArray; + +template +using BeamNode = paddle::operators::BeamNode; +template +using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; +template +using Sentence = paddle::operators::Sentence; +template +using BeamNodeVector = paddle::operators::BeamNodeVector; +template +using SentenceVector = paddle::operators::SentenceVector; + +namespace paddle { +namespace test { + +void GenerateExample(const std::vector& level_0, + const std::vector& level_1, + const std::vector& data, LoDTensorArray* ids, + LoDTensorArray* scores) { + PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1, + "source level is used to describe candidate set"); + PADDLE_ENFORCE_EQ(level_1.back(), data.size(), + "the lowest level is used to describe data" + ", so it's last element should be data length"); + + CPUPlace place; + + LoD lod; + lod.push_back(level_0); + lod.push_back(level_1); + + // Ids + LoDTensor tensor_id; + tensor_id.set_lod(lod); + tensor_id.Resize({static_cast(data.size())}); + // malloc memory + int64_t* id_ptr = tensor_id.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + id_ptr[i] = static_cast(data.at(i)); + } + + // Scores + LoDTensor tensor_score; + tensor_score.set_lod(lod); + tensor_score.Resize({static_cast(data.size())}); + // malloc memory + float* score_ptr = tensor_score.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + score_ptr[i] = static_cast(data.at(i)); + } + + ids->push_back(tensor_id); + scores->push_back(tensor_score); +} + +} // namespace test +} // namespace paddle + +TEST(BeamSearchDecodeOp, DeleteBeamNode) { + auto* root = new BeamNode(0, 0); + auto* b1 = new BeamNode(1, 1); + auto* b2 = new BeamNode(2, 2); + auto* b3 = new BeamNode(3, 3); + + 
b1->AppendTo(root); + b2->AppendTo(root); + b3->AppendTo(b1); + + delete b3; + delete b2; +} + +TEST(BeamSearchDecodeOp, MakeSentence) { + auto* root = new BeamNode(0, 0); + auto* b1 = new BeamNode(1, 1); + auto* end = new BeamNode(2, 2); + b1->AppendTo(root); + end->AppendTo(b1); + + BeamSearchDecoder helper; + Sentence sentence = helper.MakeSentence(end); + delete end; + + std::vector expect_ids = {0, 1, 2}; + ASSERT_EQ(sentence.word_ids, expect_ids); + + std::vector expect_scores = {0, 1, 2}; + ASSERT_EQ(sentence.scores, expect_scores); +} + +TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) { + CPUPlace place; + + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample( + std::vector{0, 2, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, + std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + + std::vector> beamnode_vector_list; + std::vector> sentence_vector_list( + 2, SentenceVector()); + + BeamSearchDecoder helper; + beamnode_vector_list = helper.PackTwoSteps( + ids[0], scores[0], beamnode_vector_list, &sentence_vector_list); + ASSERT_EQ(beamnode_vector_list.size(), 2UL); + ASSERT_EQ(beamnode_vector_list[0].size(), 2UL); + ASSERT_EQ(beamnode_vector_list[1].size(), 4UL); +} + +TEST(BeamSearchDecodeOp, PackTwoSteps) { + CPUPlace place; + + // first source has three prefix + BeamNodeVector source0_prefixes; + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(1, 1))); + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(0, 0))); + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(3, 3))); + + // second source has two prefix + BeamNodeVector source1_prefixes; + source1_prefixes.push_back( + std::unique_ptr>(new BeamNode(4, 4))); + source1_prefixes.push_back( + std::unique_ptr>(new BeamNode(5, 5))); + + std::vector> beamnode_vector_list; + std::vector> sentence_vector_list( + 2, SentenceVector()); + + beamnode_vector_list.push_back(std::move(source0_prefixes)); + beamnode_vector_list.push_back(std::move(source1_prefixes)); + + // 
generate data for one step + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample(std::vector{0, 3, 5}, + std::vector{0, 1, 1, 3, 4, 5}, + std::vector{0, 1, 2, 3, 4}, &ids, &scores); + + BeamSearchDecoder helper1; + beamnode_vector_list = helper1.PackTwoSteps( + ids[0], scores[0], beamnode_vector_list, &sentence_vector_list); + + ASSERT_EQ(sentence_vector_list[0].size(), 1UL); + ASSERT_EQ(sentence_vector_list[1].size(), 0UL); + ASSERT_EQ(beamnode_vector_list[0].size(), 3UL); + ASSERT_EQ(beamnode_vector_list[1].size(), 2UL); +} + +TEST(BeamSearchDecodeOp, PackAllSteps) { + CPUPlace place; + + // we will constuct a sample data with 3 steps and 2 source sentences + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample( + std::vector{0, 3, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, + std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + paddle::test::GenerateExample( + std::vector{0, 3, 6}, std::vector{0, 1, 1, 3, 5, 5, 6}, + std::vector{0, 1, 2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 3, 6}, + std::vector{0, 0, 1, 2, 3, 4, 5}, + std::vector{0, 1, 2, 3, 4}, &ids, &scores); + + ASSERT_EQ(ids.size(), 3UL); + ASSERT_EQ(scores.size(), 3UL); + + BeamSearchDecoder helper; + + LoDTensor id_tensor; + LoDTensor score_tensor; + helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); + + LoD lod = id_tensor.lod(); + std::vector expect_source_lod = {0, 4, 8}; + EXPECT_EQ(lod[0], expect_source_lod); + std::vector expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; + EXPECT_EQ(lod[1], expect_sentence_lod); + // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 + std::vector expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, + 4, 3, 2, 4, 4, 3, 6, 5, 4}; + ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); + for (size_t i = 0; i < expect_data.size(); ++i) { + ASSERT_EQ(id_tensor.data()[i], + static_cast(expect_data[i])); + } + for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) { + 
ASSERT_EQ(score_tensor.data()[i], + static_cast(id_tensor.data()[i])); + } +} diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f4c8c7e06ee17b4cf3880db7bc8ddfbb88df3b8 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.cc @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_op.h" + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +void BeamSearch::operator()(const framework::LoDTensor &pre_ids, + framework::LoDTensor *selected_ids, + framework::LoDTensor *selected_scores) { + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + + auto items = SelectTopBeamSizeItems(); + auto selected_items = ToMap(items, high_level.back()); + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset:" << i; + for (auto &item : selected_items[i]) { + VLOG(3) << ItemToString(item); + } + } + PruneEndidCandidates(pre_ids, &selected_items); + // calculate the output tensor's height + size_t num_instances = std::accumulate( + std::begin(selected_items), std::end(selected_items), 0, + [](size_t a, std::vector &b) { return a + b.size(); }); + // the output tensor shape 
should be [num_instances, 1] + auto dims = framework::make_ddim( + std::vector({static_cast(num_instances), 1})); + selected_ids->Resize(dims); + selected_scores->Resize(dims); + + std::map> hash; + framework::LoD new_lod; + auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); + auto *scores_data = + selected_scores->mutable_data(platform::CPUPlace()); + + // fill in data + std::vector low_level; + size_t low_offset = 0; + for (auto &items : selected_items) { + low_level.push_back(low_offset); + sort(items.begin(), items.end(), [](const Item &a, const Item &b) { + if (a.offset < b.offset) { + return true; + } + return a.id < b.id; + }); + for (auto &item : items) { + ids_data[low_offset] = item.id; + scores_data[low_offset] = item.score; + low_offset++; + } + } + low_level.push_back(low_offset); + + // fill lod + framework::LoD lod(2); + lod[0].assign(high_level.begin(), high_level.end()); + lod[1].assign(low_level.begin(), low_level.end()); + if (!framework::CheckLoD(lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + } + selected_ids->set_lod(lod); + selected_scores->set_lod(lod); +} + +int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, + std::vector> *items) { + auto *pre_ids_data = pre_ids.data(); + + int res = 0; + for (size_t offset = 0; offset < items->size(); offset++) { + auto prefix_id = pre_ids_data[offset]; + if (prefix_id == end_id_) { + items->at(offset).clear(); + } else { + res++; + } + } + + return res; +} + +std::vector> BeamSearch::ToMap( + const std::vector> &items, size_t element_num) { + std::vector> result; + result.resize(element_num); + for (auto &entries : items) { + for (const auto &item : entries) { + result[item.offset].push_back(item); + } + } + return result; +} + +std::vector> +BeamSearch::SelectTopBeamSizeItems() { + std::vector> result; + std::vector items; + // for each source sentence, select the top beam_size items across all + // candidate sets. 
+ while (NextItemSet(&items)) { + std::nth_element(std::begin(items), std::begin(items) + beam_size_, + std::end(items), [](const Item &a, const Item &b) { + // TODO(superjom) make score's comparation customizable. + // partial sort in descending order + return a.score > b.score; + }); + // prune the top beam_size items. + if (items.size() > beam_size_) { + items.resize(beam_size_); + } + result.emplace_back(items); + } + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << ItemToString(item); + } + } + + return result; +} + +// the candidates of a source +bool BeamSearch::NextItemSet(std::vector *items) { + if (sent_offset_ >= ids_->NumElements(lod_level_)) { + return false; + } + // find the current candidates + auto ids = *ids_; + auto scores = *scores_; + + auto abs_lod = framework::ToAbsOffset(ids.lod()); + + auto *ids_data = ids.data(); + auto *scores_data = scores.data(); + + size_t instance_dim = 1; + for (int i = 1; i < ids.dims().size(); i++) { + instance_dim *= ids.dims()[i]; + } + + items->clear(); + items->reserve(framework::product(ids.dims())); + for (size_t offset = abs_lod[lod_level_][sent_offset_]; + offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { + for (size_t d = 0; d < instance_dim; d++) { + const size_t dim_offset = offset * instance_dim + d; + items->emplace_back(offset, ids_data[dim_offset], + scores_data[dim_offset]); + } + } + + sent_offset_++; + return true; +} + +std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { + os << "{"; + os << "offset: " << item.offset << ", "; + os << "id: " << item.id << ", "; + os << "score: " << item.score << ""; + os << "}"; + + return os; +} + +std::string ItemToString(const BeamSearch::Item &item) { + std::ostringstream stream; + stream << item; + return stream.str(); +} + +class BeamSearchProtoAndCheckerMaker + : public framework::OpProtoAndCheckerMaker { + 
public: + BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + // inputs and outputs stored in proto + AddInput("pre_ids", "ids in previous step"); + AddInput("ids", "a LoDTensor of shape of [None,k]"); + AddInput("scores", + "a LoDTensor that has the same shape and LoD with `ids`"); + AddOutput("selected_ids", + "a LoDTensor that stores the IDs selected by beam search"); + AddOutput( + "selected_scores", + "a LoDTensor that has the same shape and LoD with `selected_ids`"); + + // Attributes stored in AttributeMap + AddAttr("level", "the level of LoDTensor"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); + + AddComment( + "This is a beam search operator that help to generate sequences."); + } +}; + +class BeamSearchInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + for (const std::string &arg : + std::vector({"pre_ids", "ids", "scores"})) { + PADDLE_ENFORCE(context->HasInput(arg), + "BeamSearch need input argument '%s'", arg); + } + for (const std::string &arg : + std::vector({"selected_ids", "selected_scores"})) { + PADDLE_ENFORCE(context->HasOutput(arg), + "BeamSearch need output argument '%s'", arg); + } + } +}; + +class BeamSearchInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o : op_desc.Output("selected_ids")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + for (auto &o : op_desc.Output("selected_scores")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp, + paddle::operators::BeamSearchProtoAndCheckerMaker, + 
paddle::operators::BeamSearchInferShape, + paddle::operators::BeamSearchInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9e2a05a60c30e388093aceddd40e58273364c8f9 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.h @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_TESTING +#include "gtest/gtest.h" +#endif + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +/* + * This is an implementation of beam search. + * + * To explain the details, lets take machine translation task for example, in + * this task, one source sentence is translated to multiple target sentences, + * during this period, one sentence will be translated to multiple translation + * prefixes(target sentence that have not ended), in each time step a prefix + * will have some candidates, input the candidate ids and their corresponding + * scores (probabilities), it will sort and select the top beam_size candidates + * for each source sentence, and store the selected candidates's score and their + * corresponding ids to LoDTensors. 
+ * + * A detailed example: + * + * Input + * + * ids: + * LoD (should have 2 levels) + * first level: [0, 1, 4] + * second level: [0, 1, 2, 3, 4] + * + * tensor's data + * [ + * [4, 2, 5] + * [2, 1, 3] + * [3, 5, 2] + * [8, 2, 1] + * ] + * + * scores: + * LoD same as `ids` + * tensor's data + * [ + * [0.5, 0.3, 0.2] + * [0.6, 0.3, 0.1] + * [0.9, 0.5, 0.1] + * [0.7, 0.5, 0.1] + * ] + * + * the inputs means that there are 2 source sentences to translate, and the + * first source has 1 prefix, the second source has 2 prefix. + * + * lets assume beam size is 2, and the beam search's output should be + * LoD + * first level: + * [0, 1, 2] + * second level: + * [0, 2, 4] + * + * id tensor's data + * [[ + * 4, + * 1, + * 3, + * 8, + * ]] + * + * score tensor's data + * [[ + * 0.5, + * 0.3, + * 0.9, + * 0.7 + * ]] + * + * TODO all the prune operations should be in the beam search, so it is better + * to split the beam search algorithm into a sequence of smaller operators, and + * the prune operators can be inserted in this sequence. + */ +class BeamSearch { + public: + // TODO(superjom) make type customizable + using id_t = size_t; + using score_t = float; + /* + * Input the arguments that needed by this class. + */ + BeamSearch(const framework::LoDTensor& ids, + const framework::LoDTensor& scores, size_t level, size_t beam_size, + int end_id) + : beam_size_(beam_size), + ids_(&ids), + scores_(&scores), + lod_level_(level), + end_id_(end_id) {} + + /* + * The main function of beam search. + * + * @selected_ids: a [None, 1]-shaped tensor with LoD. + * In a machine translation model, it might be the candidate term id sets, + * each set stored as a varience-length sequence. + * The format might be described with a two-level LoD + * - [[0 1] + * - [0 1 2]] + * - [[] + * - [0 1]] + * the first level of LoD tells that there are two source sentences. The + * second level describes the details of the candidate id set's offsets in + * the + * source sentences. 
+ * + * @selected_scores: a LoD tensor with the same shape and LoD with + * selected_ids. + * It stores the corresponding scores of candidate ids in selected_ids. + * + * Return false if all the input tensor is empty, in machine translation task + * that means no candidates is provided, and the task will stop running. + */ + void operator()(const framework::LoDTensor& pre_ids, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores); + /* + * The basic items help to sort. + */ + struct Item { + Item() {} + Item(size_t offset, size_t id, float score) + : offset(offset), id(id), score(score) {} + // offset in the higher lod level. + size_t offset; + // // prefix id in the lower lod level. + // size_t prefix; + // the candidate id + id_t id; + // the corresponding score + score_t score; + }; + + protected: + /* + * Delete all the records that follows the end token. + */ + int PruneEndidCandidates(const framework::LoDTensor& pre_ids, + std::vector>* items); + + /* + * Transform the items into a map whose key is offset, value is the items. + * NOTE low performance + */ + std::vector> ToMap( + const std::vector>& inputs, size_t element_num); + + /* + * For each source, select top beam_size records. + */ + std::vector> SelectTopBeamSizeItems(); + + /* + * Get the items of next source sequence, return false if no remaining items. 
+ */ + bool NextItemSet(std::vector* items); + + private: + size_t beam_size_; + const framework::LoDTensor* ids_; + const framework::LoDTensor* scores_; + size_t lod_level_{0}; + size_t sent_offset_{0}; + int end_id_{0}; +}; + +std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); + +std::string ItemToString(const BeamSearch::Item& item); + +class BeamSearchOp : public framework::OperatorBase { + public: + BeamSearchOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + BeamSearchOp(const BeamSearchOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not Implemented"); + } + + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto ids_var = scope.FindVar(Input("ids")); + auto scores_var = scope.FindVar(Input("scores")); + auto pre_ids_var = scope.FindVar(Input("pre_ids")); + PADDLE_ENFORCE_NOT_NULL(ids_var); + PADDLE_ENFORCE_NOT_NULL(scores_var); + PADDLE_ENFORCE_NOT_NULL(pre_ids_var); + + auto& ids = ids_var->Get(); + auto& scores = scores_var->Get(); + auto& pre_ids = pre_ids_var->Get(); + size_t level = Attr("level"); + size_t beam_size = Attr("beam_size"); + int end_id = Attr("end_id"); + BeamSearch alg(ids, scores, level, beam_size, end_id); + + auto selected_ids_var = scope.FindVar(Output("selected_ids")); + auto selected_scores_var = scope.FindVar(Output("selected_scores")); + PADDLE_ENFORCE_NOT_NULL(selected_ids_var); + PADDLE_ENFORCE_NOT_NULL(selected_scores_var); + auto& selected_ids_tensor = + *selected_ids_var->GetMutable(); + auto& selected_scores_tensor = + *selected_scores_var->GetMutable(); + alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op_test.cc 
b/paddle/fluid/operators/beam_search_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea2afda4d492ccde8889a394201b398eeff3badb --- /dev/null +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_op.h" + +#include +#include + +namespace paddle { +namespace test { + +using std::vector; +using framework::LoDTensor; +using framework::LoD; +using operators::BeamSearch; +using paddle::platform::CPUPlace; +using std::cout; +using std::endl; + +void CreateInput(LoDTensor* ids, LoDTensor* scores) { + LoD lod; + vector level0({0, 1, 4}); + vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = framework::make_ddim(vector({4, 3})); + ids->Resize(dims); + scores->Resize(dims); + CPUPlace place; + + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + vector _scores( + {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1}); + + for (int i = 0; i < 12; i++) { + ids_data[i] = _ids[i]; + scores_data[i] = _scores[i]; + } +} + +TEST(beam_search_op, run) { + CPUPlace place; + LoDTensor ids, scores; + CreateInput(&ids, &scores); + + LoDTensor pre_ids; + pre_ids.Resize(framework::make_ddim(vector(4, 1))); + 
for (int i = 0; i < 4; i++) { + pre_ids.mutable_data(place)[i] = i + 1; + } + + BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); + LoDTensor sids, sscores; + beamsearch(pre_ids, &sids, &sscores); + + LOG(INFO) << "score: " << sscores << endl; + + ASSERT_EQ(sids.lod(), sscores.lod()); + + vector tids({2, 4, 3, 8}); + vector tscores({0.3, 0.5, 0.9, 0.7}); + + for (int i = 0; i < 4; i++) { + ASSERT_EQ(tids[i], sids.data()[i]); + ASSERT_EQ(tscores[i], sscores.data()[i]); + } +} + +} // namespace test +} // namespace paddle diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc378b1b4536273e4364a488eb7a4ca2cc706782 --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/bilinear_tensor_product_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class BilinearTensorProductOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto weight_dims = ctx->GetInputDim("Weight"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL, + "The input(Weight) must be a 3D tensor."); + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The first dimension(batch_size) of input(X) must be " + "equal to the first dimension of the input(Y)."); + PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1], + "The second dimension of input(X) must be equal to " + "the second dimension of the input(Weight)."); + PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2], + "The second dimension of input(Y) must be equal to " + "the third dimension of the input(Weight)."); + + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL, + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector)."); + PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0], + "The second dimension of input(Bias) must be equal " + "to the first dimension of the input(Weight)."); + } + + ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; 
+ +class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of bilinear_tensor_product operator."); + AddInput("Y", "The second input of bilinear_tensor_product operator."); + AddInput("Weight", + "The learnable parameters of bilinear_tensor_product operator."); + AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.") + .AsDispensable(); + AddOutput("Out", "The output of bilinear_tensor_product operator."); + AddComment(R"DOC( +Bilinear Tensor Product operator. +Given input X and Y, a 3D tensor Weight and a Bias. Each column of the +Output is computed by one slice $i = 1, . . . , k$ of the tensor: + +$$ +M = (X W_i) * Y \\ +Out_i = \sum_j {M_j} + Bias_i +$$ + +Where $W_i$ is the $i$-th slice of Input(Weight); + $M_j$ is the $j$-th column of $M$; + $Out_i$ is the $i$-th column of Output(Out); + $Bias_i$ is a column vector, each element of it is equal to + the $i$-th element of $Bias$; + +)DOC"); + } +}; + +class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto weight_dims = ctx->GetInputDim("Weight"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(out_dims.size(), 2UL, + "The input(Out@GRAD) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ( + 
x_dims[0], out_dims[0], + "The first dimension(batch_size) of input(Out@GRAD) must be " + "equal to the first dimension of the Input(X)."); + PADDLE_ENFORCE_EQ( + weight_dims[0], out_dims[1], + "The second dimension of input(Out@GRAD) must be equal to " + "the third dimension of the Input(Weight)."); + + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ( + bias_dims[1], out_dims[1], + "The second dimension of input(Out@GRAD) must be equal to " + "the second dimension of the Input(Bias)."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + auto weight_grad_name = framework::GradVarName("Weight"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + if (ctx->HasOutput(weight_grad_name)) { + ctx->SetOutputDim(weight_grad_name, weight_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp, + ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad, + ops::BilinearTensorProductOpGrad); +REGISTER_OP_CPU_KERNEL( + bilinear_tensor_product, + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CPU_KERNEL( + bilinear_tensor_product_grad, + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2cec48ee69ac500c0b0ba84f4b6fc20415f4b82c --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 
PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/bilinear_tensor_product_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + bilinear_tensor_product, + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( + bilinear_tensor_product_grad, + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h new file mode 100644 index 0000000000000000000000000000000000000000..626fa957c42c02c978519c1869cd5f0679d22a26 --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +class BilinearTensorProductKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto y_mat = EigenMatrix::From(*y); + auto output_mat = EigenMatrix::From(*out); + + auto batch_size = x->dims()[0]; + auto weight_dims = weight->dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + + // Create the intermediate variable to caculate the result of + // Input(X) multiplied by Input(Weight_i), the formula is: + // left_mul = X Weight_i. 
+ Tensor left_mul; + left_mul.mutable_data(framework::make_ddim({batch_size, y_dim}), + ctx.GetPlace()); + auto left_mul_mat = EigenMatrix::From(left_mul); + + for (int i = 0; i < out_dim; ++i) { + auto output_col_vec = output_mat.chip(i, 1); + Tensor weight_mat = + weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); + math::gemm(dev_ctx, CblasNoTrans, CblasNoTrans, + batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); + output_col_vec.device(place) = + (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); + } + if (bias) { + auto bias_vec = EigenMatrix::From(*bias); + Eigen::DSizes bcast(batch_size, 1); + output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; + } + } +}; + +template +class BilinearTensorProductGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* weight = ctx.Input("Weight"); + Tensor* d_x = ctx.Output(framework::GradVarName("X")); + Tensor* d_y = ctx.Output(framework::GradVarName("Y")); + Tensor* d_weight = ctx.Output(framework::GradVarName("Weight")); + Tensor* d_bias = ctx.Output(framework::GradVarName("Bias")); + const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); + + auto batch_size = x->dims()[0]; + auto weight_dims = weight->dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + + auto x_mat = EigenMatrix::From(*x); + auto y_mat = EigenMatrix::From(*y); + auto d_out_mat = EigenMatrix::From(*d_out); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + // Create the intermediate variable to caculate the Output(Y@Grad). 
+ Tensor x_scale; + x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), + ctx.GetPlace()); + auto x_scale_mat = EigenMatrix::From(x_scale); + + // Create the intermediate variable to caculate the Output(X@Grad). + Tensor y_scale; + y_scale.mutable_data(framework::make_ddim({batch_size, y_dim}), + ctx.GetPlace()); + auto y_scale_mat = EigenMatrix::From(y_scale); + + math::SetConstant set_zero; + + // Set Output(X@Grad) be zero. + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_x, static_cast(0)); + } + + // Set Output(Y@Grad) be zero. + if (d_y) { + d_y->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_y, static_cast(0)); + } + + // Caculate the Output(X@Grad) and Output(Y@Grad). + if (d_x || d_y) { + Eigen::DSizes bcast_for_x(1, y_dim); + Eigen::DSizes bcast_for_y(1, x_dim); + for (int i = 0; i < out_dim; ++i) { + Tensor weight_i = weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + auto output_vec = d_out_mat.chip(i, 1); + if (d_x) { + y_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_x) * + y_mat; + math::gemm( + dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); + } + if (d_y) { + x_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_y) * + x_mat; + math::gemm( + dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); + } + } + } + + // Caculate the gradient of Input(Weight). 
+ if (d_weight) { + d_weight->mutable_data(ctx.GetPlace()); + Eigen::DSizes bcast_for_weight(1, x_dim); + for (int i = 0; i < out_dim; ++i) { + Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + auto output_vec = d_out_mat.chip(i, 1); + x_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_weight) * + x_mat; + math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, + y_dim, batch_size, 1, x_scale.data(), + y->data(), 0, d_weight_i.data()); + } + } + + // Caculate the gradient of Input(Bias). + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); + d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d614bf704382da7743b2128fa57a147e8db33d24 --- /dev/null +++ b/paddle/fluid/operators/bipartite_match_op.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class BipartiteMatchOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("DistMat"), + "Input(DistMat) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchIndices"), + "Output(ColToRowMatchIndices) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchDist"), + "Output(ColToRowMatchDist) of BipartiteMatch should not be null."); + + auto dims = ctx->GetInputDim("DistMat"); + PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); + + ctx->SetOutputDim("ColToRowMatchIndices", dims); + ctx->SetOutputDim("ColToRowMatchDist", dims); + } +}; + +template +class BipartiteMatchKernel : public framework::OpKernel { + public: + // The match_indices must be initialized to -1 at first. + // The match_dist must be initialized to 0 at first. 
+ void BipartiteMatch(const Tensor& dist, int* match_indices, + T* match_dist) const { + constexpr T kEPS = static_cast(1e-6); + PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); + int64_t row = dist.dims()[0]; + int64_t col = dist.dims()[1]; + auto* dist_data = dist.data(); + std::vector row_pool; + for (int i = 0; i < row; ++i) { + row_pool.push_back(i); + } + while (row_pool.size() > 0) { + int max_idx = -1; + int max_row_idx = -1; + T max_dist = -1; + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { + continue; + } + for (size_t k = 0; k < row_pool.size(); ++k) { + int m = row_pool[k]; + // distance is 0 between m-th row and j-th column + if (dist_data[m * col + j] < kEPS) { + continue; + } + if (dist_data[m * col + j] > max_dist) { + max_idx = j; + max_row_idx = m; + max_dist = dist_data[m * col + j]; + } + } + } + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); + match_indices[max_idx] = max_row_idx; + match_dist[max_idx] = max_dist; + // Erase the row index. + row_pool.erase( + std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* dist_mat = context.Input("DistMat"); + auto* match_indices = context.Output("ColToRowMatchIndices"); + auto* match_dist = context.Output("ColToRowMatchDist"); + + auto& dev_ctx = context.device_context(); + + auto col = dist_mat->dims()[1]; + + int64_t n = dist_mat->lod().size() == 0UL + ? 
1 + : static_cast(dist_mat->lod().back().size() - 1); + if (dist_mat->lod().size()) { + PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + match_indices->mutable_data({n, col}, context.GetPlace()); + match_dist->mutable_data({n, col}, context.GetPlace()); + + math::SetConstant iset; + iset(dev_ctx, match_indices, static_cast(-1)); + math::SetConstant tset; + tset(dev_ctx, match_dist, static_cast(0)); + + int* indices = match_indices->data(); + T* dist = match_dist->data(); + if (n == 1) { + BipartiteMatch(*dist_mat, indices, dist); + } else { + auto lod = dist_mat->lod().back(); + for (size_t i = 0; i < lod.size() - 1; ++i) { + Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); + BipartiteMatch(one_ins, indices + i * col, dist + i * col); + } + } + } +}; + +class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "DistMat", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[K, M]. It is pair-wise distance matrix between the entities " + "represented by each row and each column. For example, assumed one " + "entity is A with shape [K], another entity is B with shape [M]. The " + "DistMat[i][j] is the distance between A[i] and B[j]. The bigger " + "the distance is, the better macthing the pairs are. Please note, " + "This tensor can contain LoD information to represent a batch of " + "inputs. One instance of this batch can contain different numbers of " + "entities."); + AddOutput("ColToRowMatchIndices", + "(Tensor) A 2-D Tensor with shape [N, M] in int type. " + "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it " + "means B[j] does not match any entity in i-th instance. " + "Otherwise, it means B[j] is matched to row " + "ColToRowMatchIndices[i][j] in i-th instance. 
The row number of " + "i-th instance is saved in ColToRowMatchIndices[i][j]."); + AddOutput("ColToRowMatchDist", + "(Tensor) A 2-D Tensor with shape [N, M] in float type. " + "N is batch size. If ColToRowMatchIndices[i][j] is -1, " + "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed " + "ColToRowMatchIndices[i][j] = d, and the row offsets of each " + "instance are called LoD. Then " + "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]"); + AddComment(R"DOC( +This operator is a greedy bipartite matching algorithm, which is used to +obtain the matching with the maximum distance based on the input +distance matrix. For input 2D matrix, the bipartite matching algorithm can +find the matched column for each row, also can find the matched row for +each column. And this operator only calculate matched indices from column +to row. For each instance, the number of matched indices is the number of +of columns of the input ditance matrix. + +There are two outputs to save matched indices and distance. +A simple description, this algothrim matched the best (maximum distance) +row entity to the column entity and the matched indices are not duplicated +in each row of ColToRowMatchIndices. If the column entity is not matched +any row entity, set -1 in ColToRowMatchIndices. + +Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. +If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. +If Tensor, the height of ColToRowMatchIndices is 1. 
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp, + ops::BipartiteMatchOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel, + ops::BipartiteMatchKernel); diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e0fee22d8d828978fe74bd48e46ea6c8063150d --- /dev/null +++ b/paddle/fluid/operators/box_coder_op.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/box_coder_op.h" + +namespace paddle { +namespace operators { + +class BoxCoderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), + "Output(OutputBox) of BoxCoderOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBoxVar must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + + GetBoxCodeType(ctx->Attrs().Get("code_type")); + + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); + } +}; + +class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature 
map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor) " + "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " + "of variance."); + AddInput( + "TargetBox", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the box if the input " + "is image feature map, they are close to the origin of the coordinate " + "system. [xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddAttr("code_type", + "(string, default encode_center_size) " + "the code type used with the target box") + .SetDefault("encode_center_size") + .InEnum({"encode_center_size", "decode_center_size"}); + AddOutput( + "OutputBox", + "(LoDTensor or Tensor) " + "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] " + "representing the result of N target boxes encoded/decoded with " + "M Prior boxes and variances."); + + AddComment(R"DOC( +Bounding Box Coder Operator. +Encode/Decode the target bounding box with the priorbox information. +The Encoding schema described below: +ox = (tx - px) / pw / pxv +oy = (ty - py) / ph / pyv +ow = log(abs(tw / pw)) / pwv +oh = log(abs(th / ph)) / phv +The Decoding schema described below: +ox = (pw * pxv * tx * + px) - tw / 2 +oy = (ph * pyv * ty * + py) - th / 2 +ow = exp(pwv * tw) * pw + tw / 2 +oh = exp(phv * th) * ph + th / 2 +where tx, ty, tw, th denote the target box's center coordinates, width and +height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor) +center coordinates, width and height. 
pxv, pyv, pwv, phv denote the variance +of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates, +width and height. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker); +REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..dd9299ceacdf2507f51f895c71041c1645dd8371 --- /dev/null +++ b/paddle/fluid/operators/box_coder_op.cu @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/box_coder_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void EncodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_center_x = + (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / + 2; + T target_box_center_y = (target_box_data[row_idx * len + 3] + + target_box_data[row_idx * len + 1]) / + 2; + T target_box_width = + target_box_data[row_idx * len + 2] - target_box_data[row_idx * len]; + T target_box_height = + target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1]; + + output[idx * len] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[col_idx * len]; + output[idx * len + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / + prior_box_var_data[col_idx * len + 1]; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) / + prior_box_var_data[col_idx * len + 2]; + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) / + prior_box_var_data[col_idx * len + 3]; + } +} + +template +__global__ void DecodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const 
int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_width = exp(prior_box_var_data[col_idx * len + 2] * + target_box_data[row_idx * len + 2]) * + prior_box_width; + T target_box_height = exp(prior_box_var_data[col_idx * len + 3] * + target_box_data[row_idx * len + 3]) * + prior_box_height; + T target_box_center_x = prior_box_var_data[col_idx * len] * + target_box_data[row_idx * len] * + prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[col_idx * len + 1] * + target_box_data[row_idx * len + 1] * + prior_box_height + + prior_box_center_y; + + output[idx * len] = target_box_center_x - target_box_width / 2; + output[idx * len + 1] = target_box_center_y - target_box_height / 2; + output[idx * len + 2] = target_box_center_x + target_box_width / 2; + output[idx * len + 3] = target_box_center_y + target_box_height / 2; + } +} + +template +class BoxCoderCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = 
prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + int block = 512; + int grid = (row * col + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T* prior_box_data = prior_box->data(); + const T* prior_box_var_data = prior_box_var->data(); + const T* target_box_data = target_box->data(); + + output_box->mutable_data({row, col, len}, context.GetPlace()); + T* output = output_box->data(); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/fluid/operators/box_coder_op.h b/paddle/fluid/operators/box_coder_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c41bcc212b8fcfc4a274a53db4b25161ecdb3fe5 --- /dev/null +++ b/paddle/fluid/operators/box_coder_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; + +inline BoxCodeType GetBoxCodeType(const std::string& type) { + if (type == "encode_center_size") { + return BoxCodeType::kEncodeCenterSize; + } else if (type == "decode_center_size") { + return BoxCodeType::kDecodeCenterSize; + } + PADDLE_THROW("Not support type %s.", type); +} + +template +class BoxCoderKernel : public framework::OpKernel { + public: + void EncodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = + target_box_data[i * len + 2] - target_box_data[i * len]; + T target_box_height = + target_box_data[i * len + 3] - target_box_data[i * len + 1]; + + size_t offset = i * col * len + j * len; + output[offset] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[j * len]; + output[offset + 1] = (target_box_center_y - 
prior_box_center_y) / + prior_box_height / prior_box_var_data[j * len + 1]; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)) / + prior_box_var_data[j * len + 2]; + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)) / + prior_box_var_data[j * len + 3]; + } + } + } + void DecodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = prior_box_var_data[j * len] * + target_box_data[i * len] * prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_data[i * len + 1] * + prior_box_height + + prior_box_center_y; + T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_data[i * len + 2]) * + prior_box_width; + T target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_data[i * len + 3]) * + prior_box_height; + + size_t offset = i * col * len + j * len; + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = target_box_center_x + target_box_width / 2; + output[offset + 3] = target_box_center_y + target_box_height / 2; + } + } + } + + void 
Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + + output_box->mutable_data({row, col, len}, context.GetPlace()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + T* output = output_box->data(); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..364c21f7619910784d63047f3abb3713f1bfd0fc --- /dev/null +++ b/paddle/fluid/operators/cast_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of cast op"); + AddOutput("Out", "The output tensor of cast op"); + AddAttr("out_dtype", "output data type"); + AddAttr("in_dtype", "input data type"); + AddComment(R"DOC( +Cast Operator. + +This Operator casts the input tensor to another data type and +returns tha Output Tensor. + +)DOC"); + } +}; + +class CastOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "The input of cast op must be set"); + PADDLE_ENFORCE(context->HasOutput("Out"), + "The output of cast op must be set"); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CastOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad = new framework::OpDesc(); + grad->SetType("cast"); + grad->SetInput("X", OutputGrad("Out")); + grad->SetOutput("Out", InputGrad("X")); + grad->SetAttr("out_dtype", GetAttr("in_dtype")); + grad->SetAttr("in_dtype", GetAttr("out_dtype")); + return std::unique_ptr(grad); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; +REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, + ops::CastOpProtoMaker); +REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.cu 
b/paddle/fluid/operators/cast_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb597be9d93af12afb608bf87382c283ddf78e7c --- /dev/null +++ b/paddle/fluid/operators/cast_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cast_op.h" + +template +using CastOpKernel = + paddle::operators::CastOpKernel; + +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel, + CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9ab4961cef4bd6e7d4e592581b51d7d4eb896ec7 --- /dev/null +++ b/paddle/fluid/operators/cast_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +struct CastOpFunctor { + const framework::Tensor* in_; + framework::Tensor* out_; + const DeviceContext& ctx_; + CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const DeviceContext& ctx) + : in_(in), out_(out), ctx_(ctx) {} + + template + void operator()() const { + auto* in_begin = in_->data(); + auto numel = in_->numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(ctx_.GetPlace()); + platform::Transform trans; + trans(ctx_, in_begin, in_end, out_begin, + CastOpTransformFunctor()); + } +}; + +template +class CastOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast(context.Attr("out_dtype")), + CastOpFunctor( + in, out, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..080e4d80da4752a0c6bea86c0a9f503cf46e8878 --- /dev/null +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/chunk_eval_op.h" + +namespace paddle { +namespace operators { + +class ChunkEvalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Inference"), + "Input(Inference) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Precision"), + "Output(Precision) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Recall"), + "Output(Recall) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("F1-Score"), + "Output(F1-Score) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"), + "Output(NumInferChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"), + "Output(NumLabelChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NumCorrectChunks"), + "Output(NumCorrectChunks) of ChunkEvalOp should not be null."); + + auto inference_dim = ctx->GetInputDim("Inference"); + auto label_dim = ctx->GetInputDim("Label"); + + PADDLE_ENFORCE(inference_dim == label_dim, + "Inference's shape must be the same as Label's shape."); + + ctx->SetOutputDim("Precision", {1}); + ctx->SetOutputDim("Recall", {1}); + ctx->SetOutputDim("F1-Score", {1}); + ctx->SetOutputDim("NumInferChunks", {1}); + ctx->SetOutputDim("NumLabelChunks", {1}); + 
ctx->SetOutputDim("NumCorrectChunks", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::DataType::FP32, + platform::CPUPlace()); + } +}; + +class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Inference", + "(Tensor, default: Tensor). " + "Predictions from the network."); + AddInput("Label", + "(Tensor, default: Tensor). The true tag sequences."); + AddOutput("Precision", + "(float). The evaluated precision (called positive predictive " + "value) of chunks on the given mini-batch."); + AddOutput("Recall", + "(float). The evaluated recall (true positive rate or " + "sensitivity) of chunks on the given mini-batch."); + AddOutput("F1-Score", + "(float). The evaluated F1-Score on the given mini-batch."); + AddOutput("NumInferChunks", + "(int64_t). The number of chunks in Inference on the given " + "mini-batch."); + AddOutput( + "NumLabelChunks", + "(int64_t). The number of chunks in Label on the given mini-batch."); + AddOutput( + "NumCorrectChunks", + "(int64_t). The number of chunks both in Inference and Label on the " + "given mini-batch."); + AddAttr("num_chunk_types", + "(int). The number of chunk type. See below for details."); + AddAttr( + "chunk_scheme", + "(string, default IOB). The labeling scheme indicating " + "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below " + "for details.") + .SetDefault("IOB"); + AddAttr>("excluded_chunk_types", + "(list) A list including chunk type ids " + "indicating chunk types that are not counted. " + "See below for details.") + .SetDefault(std::vector{}); + AddComment(R"DOC( +For some basics of chunking, please refer to +‘Chunking with Support Vector Machines ’. 
+ + +CheckEvalOp computes the precision, recall, and F1-score of chunk detection, +and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. +Here is a NER example of labeling for these tagging schemes: + + Li Ming works at Agricultural Bank of China in Beijing. + IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + +There are three chunk types(named entity types) including PER(person), ORG(organization) +and LOC(LOCATION), and we can see that the labels have the form -. + +Since the calculations actually use label ids rather than labels, extra attention +should be paid when mapping labels to ids to make CheckEvalOp work. The key point +is that the listed equations are satisfied by ids. + + tag_type = label % num_tag_type + chunk_type = label / num_tag_type + +where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type` +is the num of chunk types, and `tag_type` get its value from the following table. + + Scheme Begin Inside End Single + plain 0 - - - + IOB 0 1 - - + IOE - 0 1 - + IOBES 0 1 2 3 + +Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, +PER and LOC. To satisfy the above equations, the label map can be like this: + + B-ORG 0 + I-ORG 1 + B-PER 2 + I-PER 3 + B-LOC 4 + I-LOC 5 + O 6 + +It’s not hard to verify the equations noting that the num of chunk types +is 3 and the num of tag types in IOB scheme is 2. For example, the label +id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of +I-LOC is 2, which consistent with the results from the equations. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp, + ops::ChunkEvalOpMaker); +REGISTER_OP_CPU_KERNEL(chunk_eval, + ops::ChunkEvalKernel); diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3dca3d2c0f99c3b2d447ffe4516c1b6c379b13f2 --- /dev/null +++ b/paddle/fluid/operators/chunk_eval_op.h @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class ChunkEvalKernel : public framework::OpKernel { + public: + struct Segment { + int begin; + int end; + int type; + bool operator==(const Segment& y) const { + return begin == y.begin && end == y.end && type == y.type; + } + }; + + void GetSegments(const int64_t* label, int length, + std::vector& segments, int num_chunk_types, + int num_tag_types, int other_chunk_type, int tag_begin, + int tag_inside, int tag_end, int tag_single) const { + segments.clear(); + segments.reserve(length); + int chunk_start = 0; + bool in_chunk = false; + int tag = -1; + int type = other_chunk_type; + for (int i = 0; i < length; ++i) { + int prev_tag = tag; + int prev_type = type; + PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types); + tag = label[i] % num_tag_types; + type = label[i] / num_tag_types; + if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + Segment segment{ + chunk_start, // begin + i - 1, // end + prev_type, + }; + segments.push_back(segment); + in_chunk = false; + } + if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + chunk_start = i; + in_chunk = true; + } + } + if (in_chunk) { + Segment segment{ + chunk_start, // begin + length - 1, // end + type, + }; + segments.push_back(segment); + } + } + + bool ChunkEnd(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return false; + if (type == other_chunk_type) return true; + if (type != prev_type) return true; + if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single; + if (prev_tag 
== tag_inside) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_end) return true; + if (prev_tag == tag_single) return true; + return false; + } + + bool ChunkBegin(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return type != other_chunk_type; + if (type == other_chunk_type) return false; + if (type != prev_type) return true; + if (tag == tag_begin) return true; + if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_single) return true; + return false; + } + + void Compute(const framework::ExecutionContext& context) const override { + // initialize to parse configurations + int num_chunk_types, num_tag_types; + int other_chunk_type; + int tag_begin, tag_inside, tag_end, tag_single; + std::vector label_segments; + std::vector output_segments; + std::set excluded_chunk_types; + + if (context.Attr("chunk_scheme") == "IOB") { + num_tag_types = 2; + tag_begin = 0; + tag_inside = 1; + tag_end = -1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOE") { + num_tag_types = 2; + tag_begin = -1; + tag_inside = 0; + tag_end = 1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOBES") { + num_tag_types = 4; + tag_begin = 0; + tag_inside = 1; + tag_end = 2; + tag_single = 3; + } else if (context.Attr("chunk_scheme") == "plain") { + num_tag_types = 1; + tag_begin = -1; + tag_inside = -1; + tag_end = -1; + tag_single = -1; + } else { + PADDLE_THROW("Unknown chunk scheme."); + } + other_chunk_type = num_chunk_types = context.Attr("num_chunk_types"); + excluded_chunk_types.insert( + context.Attr>("excluded_chunk_types").begin(), + context.Attr>("excluded_chunk_types").end()); + + auto* inference = context.Input("Inference"); + auto place = inference->place(); + auto* label = 
context.Input("Label"); + auto* precision = context.Output("Precision"); + auto* recall = context.Output("Recall"); + auto* f1 = context.Output("F1-Score"); + auto* num_infer_chunks = context.Output("NumInferChunks"); + auto* num_label_chunks = context.Output("NumLabelChunks"); + auto* num_correct_chunks = context.Output("NumCorrectChunks"); + + const int64_t* inference_data = inference->data(); + const int64_t* label_data = label->data(); + T* precision_data = precision->mutable_data(place); + T* racall_data = recall->mutable_data(place); + T* f1_data = f1->mutable_data(place); + int64_t* num_infer_chunks_data = + num_infer_chunks->mutable_data(place); + int64_t* num_label_chunks_data = + num_label_chunks->mutable_data(place); + int64_t* num_correct_chunks_data = + num_correct_chunks->mutable_data(place); + *num_infer_chunks_data = 0; + *num_label_chunks_data = 0; + *num_correct_chunks_data = 0; + + auto lod = label->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE(lod == inference->lod(), + "LoD must be same between Inference and Label."); + int num_sequences = lod[0].size() - 1; + for (int i = 0; i < num_sequences; ++i) { + int seq_length = lod[0][i + 1] - lod[0][i]; + EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length, + output_segments, label_segments, *num_infer_chunks_data, + *num_label_chunks_data, *num_correct_chunks_data, + num_chunk_types, num_tag_types, other_chunk_type, tag_begin, + tag_inside, tag_end, tag_single, excluded_chunk_types); + } + *precision_data = !(*num_infer_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_infer_chunks_data); + *racall_data = !(*num_label_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_label_chunks_data); + *f1_data = !(*num_correct_chunks_data) + ? 
0 + : 2 * (*precision_data) * (*racall_data) / + ((*precision_data) + (*racall_data)); + } + + void EvalOneSeq(const int64_t* output, const int64_t* label, int length, + std::vector& output_segments, + std::vector& label_segments, + int64_t& num_output_segments, int64_t& num_label_segments, + int64_t& num_correct, int num_chunk_types, int num_tag_types, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single, + const std::set& excluded_chunk_types) const { + GetSegments(output, length, output_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + GetSegments(label, length, label_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + size_t i = 0, j = 0; + while (i < output_segments.size() && j < label_segments.size()) { + if (output_segments[i] == label_segments[j] && + excluded_chunk_types.count(output_segments[i].type) != 1) { + ++num_correct; + } + if (output_segments[i].end < label_segments[j].end) { + ++i; + } else if (output_segments[i].end > label_segments[j].end) { + ++j; + } else { + ++i; + ++j; + } + } + for (auto& segment : label_segments) { + if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments; + } + for (auto& segment : output_segments) { + if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..89df118c06f4df6444fc1f61b5ddde48f6ad8ba7 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace paddle { +namespace operators { + +class ClipByNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipByNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipByNormOp should not be null."); + auto max_norm = ctx->Attrs().Get("max_norm"); + PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0."); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input of clip_by_norm op." + "The number of dimensions must be between [1, 9]."); + AddOutput("Out", + "(Tensor) The output of clip_by_norm op with shape as input(X)"); + AddAttr("max_norm", "(float) The maximum norm value."); + AddComment(R"DOC( +ClipByNorm Operator. + +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. 
If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: + +$$ +Out = \frac{max\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, + ops::ClipByNormOpMaker); +REGISTER_OP_CPU_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a466b335914f1fde6865c6cc375f4ef009632e41 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82bcf07657bfbf1df6b541b3953285622fb25a87 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ClipByNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { // rescales X so that its L2 norm does not exceed the max_norm attribute + auto max_norm = context.Attr("max_norm"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input); + auto out = EigenVector::Flatten(*output); + auto x_norm = x.square().sum().sqrt(); // L2 norm of the flattened input: sqrt of the sum of squares + auto& place = + *context.template device_context().eigen_device(); + + auto temp = (x_norm <= max_norm).template cast().eval(); // 1 when the norm is already within max_norm, otherwise 0 + auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; // 1 when within bound, else max_norm / x_norm. NOTE(review): x_norm is 0 for an all-zero X, so this evaluates 0 / 0 - confirm whether an epsilon guard is needed + Eigen::array one_dim{{1}}; + Eigen::DSizes m_dsize(input->numel()); + out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize); // the single scale factor is broadcast over all numel() elements + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76b2cefbf9dd67c5036b6ecfe35a9d53a54467a9 --- /dev/null +++ b/paddle/fluid/operators/clip_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors.
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_op.h" + +namespace paddle { +namespace operators { + +class ClipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto max = ctx->Attrs().Get("max"); + auto min = ctx->Attrs().Get("min"); + PADDLE_ENFORCE_LT(min, max, "max should be greater than min."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class ClipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor)The input of clip op." + "The number of dimensions must be between [1, 9]."); + AddOutput("Out", "(Tensor)The output of clip op with shape as input(X)"); + AddAttr( + "min", "(float)Minimum value, under which element is replaced by min."); + AddAttr( + "max", "(float)Maximum value, above which element is replaced by max"); + AddComment(R"DOC( +Clip Operator. + +The clip operator limits the value of given input within an interval. 
The +interval is specified with arguments 'min' and 'max': + +$$ +Out = \min(\max(X, min), max) +$$ + +)DOC"); + } +}; + +class ClipOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, + ops::ClipOpGrad); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7b044d6e699d59dda04fa19468c17faf1e0a0eb7 --- /dev/null +++ b/paddle/fluid/operators/clip_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/clip_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aecd6f83bfaf4deab4271e859bea10feecacab62 --- /dev/null +++ b/paddle/fluid/operators/clip_op.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::Transform; + +template +class ClipFunctor { + public: + explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x) const { + if (x < min_) + return min_; + else if (x > max_) + return max_; + else + return x; + } + + private: + T min_; + T max_; +}; + +template +class ClipGradFunctor { + public: + explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x, const T& y) const { + return (y > min_ && y < max_) ? 
x : 0; + } + + private: + T min_; + T max_; +}; + +template +class ClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + int64_t numel = x->numel(); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); + } +}; + +template +class ClipGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + auto* x = context.Input("X"); + int64_t numel = d_out->numel(); + auto* d_x_data = d_x->mutable_data(context.GetPlace()); + const T* d_out_data = d_out->data(); + const T* x_data = x->data(); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3414c33b5ab3cc8dffee640fd85b9625b3f237b --- /dev/null +++ b/paddle/fluid/operators/compare_op.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", + string::Sprintf("(LoDTensor) the left hand operand of %s operator", + comment.type)); + AddInput("Y", string::Sprintf( + "(LoDTensor) the right hand operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X and Y, and returns the Out. Each of them is a +N-dim tensor. X and Y could be any type. The each element of the Out tensor is +calculated by %s +)DOC", + comment.type, comment.equation)); + AddAttr("axis", + "(int, default -1). 
The start dimension index " + "for broadcasting Y onto X.") + .SetDefault(-1) + .EqualGreaterThan(-1); + } +}; + +template +class CompareOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X", + comment.type); + PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y", + comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(), + "The size of dim_y should not be greater than dim_x's."); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CompareOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // CompareOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_LOGICAL_OP(less_than, "Out = X < Y"); +REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y"); +REGISTER_LOGICAL_KERNEL(less_equal, CPU, 
paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_OP(equal, "Out = X == Y"); +REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/compare_op.cu b/paddle/fluid/operators/compare_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3507af2ae3add8cf02f5b9f3b3d89b40d73bcb0d --- /dev/null +++ b/paddle/fluid/operators/compare_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/compare_op.h" + +REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/compare_op.h b/paddle/fluid/operators/compare_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4b2ee5a9d68f5f1fd3d2d374669763855659f1db --- /dev/null +++ b/paddle/fluid/operators/compare_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct LessThanFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; } +}; + +template +struct LessEqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; } +}; + +template +struct EqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + if (std::is_floating_point::value) { + // This branch will be optimized while compiling if T is integer. It is + // safe to cast a and b to double. 
+ return fabs(static_cast(a - b)) < 1e-8; + } else { + return (a == b); + } + } +}; + +template +class CompareOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + using Tensor = framework::Tensor; + + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + z->mutable_data(context.GetPlace()); + int axis = context.Attr("axis"); + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..68eb5412beb02d9dc948eeb188a2d5b1cdb0c5b3 --- /dev/null +++ b/paddle/fluid/operators/concat_op.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/concat_op.h" +#include + +namespace paddle { +namespace operators { +using framework::Tensor; + +class ConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { // validates the inputs and derives Out's shape from the X shapes + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + "Inputs(X) of ConcatOp should not be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ConcatOp should not be null."); + + auto ins = ctx->GetInputsDim("X"); + size_t axis = static_cast(ctx->Attrs().Get("axis")); + const size_t n = ins.size(); + + PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1."); + + auto out_dims = ins[0]; + size_t in_zero_dims_size = out_dims.size(); + for (size_t i = 1; i < n; i++) { + for (size_t j = 0; j < in_zero_dims_size; j++) { + if (j == axis) { + out_dims[axis] += ins[i][j]; // sizes along the concat axis accumulate + } else { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); + } + } + } + if (out_dims[axis] < 0) { // negative sum: presumably some input's axis size was unknown (-1) - TODO confirm + out_dims[axis] = -1; + } + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input tensors of concat operator.").AsDuplicable(); + AddOutput("Out", "Output tensor of concat operator."); + AddAttr("axis", + "The axis along which the input tensors will be concatenated.") + .SetDefault(0); + AddComment(R"DOC( +Concat Operator. + +Concatenate the input tensors along dimension axis.
+Examples: + Input[0] = [[1,2],[3,4]] + Input[1] = [[5,6]] + axis = 0 + Output = [[1,2], + [3,4], + [5,6]] + +)DOC"); + } +}; + +class ConcatOpGrad : public framework::OperatorWithKernel { + public: + ConcatOpGrad(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, + ops::ConcatOpGrad, false) +REGISTER_OP_CPU_KERNEL(concat, + ops::ConcatKernel) +REGISTER_OP_CPU_KERNEL(concat_grad, + ops::ConcatGradKernel) diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..143bda6116775611e399ad805708474621d33b96 --- /dev/null +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/concat_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..72b3e225bf64f889804eb5e4fab9df4653f5452b --- /dev/null +++ b/paddle/fluid/operators/concat_op.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +template +class ConcatKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = ins.size(); + size_t output_offset = 0; + out->mutable_data(ctx.GetPlace()); + auto out_stride = framework::stride(out->dims()); + for (size_t i = 0; i < n; i++) { + auto& in = ins[i]; + auto axis_dim = in->dims()[axis]; + auto in_stride = framework::stride(in->dims()); + StridedMemcpy(ctx.device_context(), in->data(), in_stride, + in->dims(), out_stride, out->data() + output_offset); + output_offset += axis_dim * in_stride[axis]; + } + } +}; + +template +class ConcatGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { // copies consecutive slices of Out@GRAD along axis into each X@GRAD output + auto* in = ctx.Input(framework::GradVarName("Out")); + auto outs = ctx.MultiOutput(framework::GradVarName("X")); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = outs.size(); + size_t input_offset = 0; + auto in_stride = framework::stride(in->dims()); + for (size_t i = 0; i < n; i++) { + auto& out = outs[i]; + out->mutable_data(ctx.GetPlace()); + size_t axis_dim = out->dims()[axis]; + auto out_stride = framework::stride(out->dims()); + StridedMemcpy(ctx.device_context(), in->data() + input_offset, + in_stride, out->dims(), out_stride, out->data()); + input_offset += axis_dim * in_stride[axis]; // advance past the slice just copied + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd93790d5b52a2ccc8358a94f7ead346d384f191 --- /dev/null +++ b/paddle/fluid/operators/cond_op.cc @@ -0,0 +1,235 @@ +/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cond_op.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/scatter.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +using Scope = framework::Scope; +using Variable = framework::Variable; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DDim = framework::DDim; + +framework::Scope& CondOp::AddSubScope(const Scope& scope) const { + auto sub_scopes_var = scope.FindVar("SubScopes"); + PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, + "Output(SubScopes) of CondOp should not be null."); + auto sub_scopes = sub_scopes_var->GetMutable>(); + auto& sub_scope = scope.NewScope(); + sub_scopes->push_back(&sub_scope); + return sub_scope; +} + +std::vector& CondOp::GetSubScopes( + const framework::Scope& scope) const { + auto sub_scopes_var = scope.FindVar("SubScopes"); + PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, + "Output(SubScopes) of CondOp should not be null."); + return *sub_scopes_var->GetMutable>(); +} + +LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const { + auto index_tensors_var = scope.FindVar("IndexTensors"); + PADDLE_ENFORCE_NOT_NULL(index_tensors_var, + "Output(IndexTensors) of CondOp should not be null."); + auto& index_tensors = + *index_tensors_var->GetMutable>(); + index_tensors.push_back(LoDTensor()); + return 
index_tensors.back(); +} + +std::vector& CondOp::GetIndexTensors( + const framework::Scope& scope) const { + auto* index_tensors_var = scope.FindVar("IndexTensors"); + PADDLE_ENFORCE_NOT_NULL(index_tensors_var, + "Output(IndexTensors) of CondOp should not be null."); + return *index_tensors_var->GetMutable>(); +} + +void CondOp::PrepareDataForSubnet( + const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const { + PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty."); + + for (int i = 0; i < BRANCH_NUM; ++i) { + // Create two sub scopes for true and false branches + // sub_scopes[0] for the true branch + // sub_scopes[1] for the false branch + AddSubScope(scope); + // Create two tensors for true and false indices: + // index_tensors[0] for the true branch + // index_tensors[1] for the false branch + AddIndexTensor(scope); + } + + Variable* cond_var = scope.FindVar(Input("Cond")); + PADDLE_ENFORCE_NOT_NULL(cond_var, + "Input(Cond) of CondOp should not be null."); + const LoDTensor* cond = cond_var->GetMutable(); + + // get the true/false index at runtime according to cond tensor + // index_vectors[0]: vector, contains all index for cond[i] == true + // index_vectors[1]: vector, contains all index for cond[i] == false + std::vector> index_vectors; + index_vectors.resize(BRANCH_NUM); + + const int* cond_data = cond->data(); + for (int i = 0; i < cond->dims()[0]; ++i) { + if (cond_data[i]) + index_vectors[TRUE_BRANCH].push_back(i); + else + index_vectors[FALSE_BRANCH].push_back(i); + } + + // put index_vectors[0] and index_vectors[1] into two tensors: + // index_tensors[0] and index_tensors[1] + std::vector& index_tensors = GetIndexTensors(scope); + std::vector& sub_scopes = GetSubScopes(scope); + + for (int i = 0; i < BRANCH_NUM; ++i) { + DDim dim = {static_cast(index_vectors[i].size())}; + int* index_tensor_data_ptr = + index_tensors[i].mutable_data(dim, platform::CPUPlace()); + memcpy(index_tensor_data_ptr, 
index_vectors[i].data(), + dim[0] * sizeof(int)); + } + + // create input in subscopes according to index_vectors + for (auto& input : Inputs("Xs")) { + Variable* var_parent = scope.FindVar(input); + PADDLE_ENFORCE_NOT_NULL(var_parent); + const auto* tensor_parent = &var_parent->Get(); + + for (int i = 0; i < BRANCH_NUM; ++i) { + Variable* var_child = sub_scopes[i]->FindVar(input); + PADDLE_ENFORCE_NOT_NULL(var_child); + auto* tensor_child = var_child->GetMutable(); + + // Resize child + DDim dim = tensor_parent->dims(); + dim[0] = index_tensors[i].dims()[0]; + tensor_child->mutable_data(dim, platform::CPUPlace()); + + CPUGather(dev_ctx, *tensor_parent, index_tensors[i], tensor_child); + } + } + + // create output_tensors in subscope for sub_net + for (int i = 0; i < BRANCH_NUM; ++i) { + for (auto& output : (*sub_net_op_[i]).Outputs()) { + for (auto& var_name : output.second) { + sub_scopes[i]->Var(var_name); + } + } + } +} + +void CondOp::MergeDataFromSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const { + std::vector& sub_scopes = GetSubScopes(scope); + const std::vector& index_tensors = + GetIndexTensors(scope); + + // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0] + PADDLE_ENFORCE(!Outputs("Outs").empty(), + "Outputs(Outs) of CondOp can't be empty."); + for (auto& output : Outputs("Outs")) { + const LoDTensor* tensor_t_out = + &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get(); + PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL"); + const LoDTensor* tensor_f_out = + &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get(); + PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL"); + + auto* var_out = scope.FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found"); + LoDTensor* tensor_out = var_out->GetMutable(); + PADDLE_ENFORCE_NOT_NULL(tensor_out, // check the merged destination tensor; the branch outputs were already checked above + "Output tensor should not be NULL"); + + DDim true_dim = tensor_t_out->dims(); + DDim false_dim =
tensor_f_out->dims(); + true_dim[0] = 0; + false_dim[0] = 0; + PADDLE_ENFORCE_EQ(true_dim, false_dim, + "Outputs not of the same shape except the first dim"); + + DDim out_dim = tensor_t_out->dims(); + out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0]; + tensor_out->Resize(out_dim); + tensor_out->mutable_data(platform::CPUPlace()); + } + + // merge output results: + // output_tensor = true_output_tensor + false_output_tensor + for (auto& output : Outputs("Outs")) { + Variable* var_parent = scope.FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_parent); + auto* tensor_parent = var_parent->GetMutable(); + + for (int i = 0; i < BRANCH_NUM; ++i) { + Variable* var_child = sub_scopes[i]->FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_child); + auto* tensor_child = &var_child->Get(); + ScatterAssign(dev_ctx, *tensor_child, index_tensors[i], + tensor_parent); + } + } +} + +void CondOp::Run(const Scope& scope, const platform::Place& place) const { + // get device context from pool + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(place); + + PrepareDataForSubnet(scope, dev_ctx); + std::vector& sub_scopes = GetSubScopes(scope); + for (int i = 0; i < BRANCH_NUM; ++i) { + sub_net_op_[i]->Run(*sub_scopes[i], place); + } + MergeDataFromSubnet(scope, dev_ctx); +} + +class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { + public: + CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Cond", "The condition, which is a bool vector"); + AddInput("Xs", "Inputs of Subnets").AsDuplicable(); + AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable(); + + AddOutput("SubScopes", "sub scopes for true and false branches"); + AddOutput("IndexTensors", "Index Tensors contains indices for true/false"); + + AddComment(R"DOC( +Sample Dependent Conditional Operator. 
+ +Given Cond[i] as a 1/0 vector to indicate true/false: +Out[i] = subnet_true[i], if Cond[i] == true +Out[i] = subnet_false[i], if Cond[i] == false + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp, + paddle::operators::CondOpProtoAndCheckerMaker); diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h new file mode 100644 index 0000000000000000000000000000000000000000..695af4490696b29d2d47f5825ebc0159b39663c0 --- /dev/null +++ b/paddle/fluid/operators/cond_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +/* + * @brief CondOp is a dynamic if-else Operator + * + * It has a input tensor named cond indicating which netop each instance will + * run. + * + * if cond == 1, it will run true_net, which is a NetOp. + * + * if cond == 0, it will run false_net, which is another NetOp. 
+ */ +class CondOp : public framework::OperatorBase { + public: + CondOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) { + sub_net_op_.resize(BRANCH_NUM); + } + + CondOp(const CondOp& o) + : framework::OperatorBase( + static_cast(o)) { + // TODO(yuyang18): Implement copy ctor well. + PADDLE_THROW("Not implemented"); + } + + framework::Scope& AddSubScope(const framework::Scope& scope) const; + std::vector& GetSubScopes( + const framework::Scope& scope) const; + + framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const; + std::vector& GetIndexTensors( + const framework::Scope& scope) const; + + void PrepareDataForSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + void MergeDataFromSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + /* + * Set True Block + */ + void set_truenet(std::unique_ptr&& net) { + sub_net_op_[TRUE_BRANCH] = std::move(net); + } + + /* + * Set False Block + */ + void set_falsenet(std::unique_ptr&& net) { + sub_net_op_[FALSE_BRANCH] = std::move(net); + } + + void Run(const framework::Scope& scope, + const platform::Place& place) const override; + + private: + const int TRUE_BRANCH = 0; + const int FALSE_BRANCH = 1; + const int BRANCH_NUM = 2; + + // sub_net_op_[0]: subnet_t + // sub_net_op_[1]: subnet_f + std::vector> sub_net_op_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..30435c6cca0a4fb1d41dce47b8fefeafb6c48a51 --- /dev/null +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ConditionalOp : public framework::OperatorBase { + public: + ConditionalOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + std::vector InputTensors( + const framework::Scope &scope) const { + std::vector retv; + auto xs = Inputs("X"); + retv.resize(xs.size(), nullptr); + std::transform( + xs.begin(), xs.end(), retv.begin(), + [&scope](const std::string &var_name) -> const framework::LoDTensor * { + auto *var = scope.FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name); + return &var->Get(); + }); + return retv; + } + + bool ScalarCondition( + const std::vector &ips) const { + if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { + PADDLE_THROW("should have one initialized input as condition"); + } + if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() && + ips[0]->numel() == 1)) { + PADDLE_THROW( + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel()); + } + return ips[0]->data()[0]; + } +}; + +class ConditionalBlockOp : public ConditionalOp { + public: + ConditionalBlockOp(const std::string &type, + const 
framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto xs = InputTensors(scope); + + bool need_run; + if (Attr("is_scalar_condition")) { + need_run = ScalarCondition(xs); + } else { + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Output("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto *scopes = scope_var->GetMutable>(); + scopes->resize(1); + scopes->front() = &scope.NewScope(); + auto &cur_scope = *scopes->front(); + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + exec.Run(*block->Program(), &cur_scope, block->ID(), false); + } + } +}; + +class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The conditional variable of this operator. If X is empty, the " + "whole sub-block will not be executed.") + .AsDuplicable(); + AddInput("Params", "The input variables of the sub-block.").AsDuplicable(); + AddOutput("Out", "The output variables of the sub-block.").AsDuplicable(); + AddOutput("Scope", + "(std::vector) The step scope of conditional block. To " + "unify the conditional block, rnn and while op, the type of " + "scope is std::vector"); + AddAttr( + "sub_block", "The step block of conditional block operator"); + AddAttr("is_scalar_condition", + "the input X is used as scalar " + "condition") + .SetDefault(false); + AddComment(R"DOC(Conditional block operator + +Run the sub-block if X is not empty. Params is the other inputs and Out is the +outputs of the sub-block. 
+)DOC"); + } +}; + +class ConditionalBlockGradOp : public ConditionalOp { + public: + ConditionalBlockGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto xs = this->InputTensors(scope); + + bool need_run; + if (Attr("is_scalar_condition")) { + need_run = ScalarCondition(xs); + } else { + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Input("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto &scopes = scope_var->Get>(); + framework::Scope &cur_scope = *scopes[0]; + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + exec.Run(*block->Program(), &cur_scope, block->ID(), false); + + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), + Outputs(framework::GradVarName("Params"))); + + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), + Outputs(framework::GradVarName("X"))); + } + } + + private: + void AssignLocalGradientToGlobal( + const platform::Place &place, const framework::Scope &cur_scope, + const std::vector &p_names, + const std::vector &pg_names) const { + for (size_t i = 0; i < p_names.size(); ++i) { + auto out_grad_name = pg_names[i]; + auto in_grad_name = framework::GradVarName(p_names[i]); + auto *in_var = cur_scope.FindVar(in_grad_name); + if (in_var == nullptr) { + continue; + } + auto new_in_grad_name = cur_scope.Rename(in_grad_name); + auto assign = framework::OpRegistry::CreateOp( + "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, + framework::AttributeMap{}); + assign->Run(cur_scope, place); + cur_scope.Rename(new_in_grad_name, in_grad_name); + } + } +}; + +class 
ConditionalBlockGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInputs("X")); + if (context->HasInputs("Params")) { + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params"))); + context->SetOutputsDim(framework::GradVarName("Params"), + context->GetInputsDim("Params")); + } + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X"))); + context->SetOutputsDim(framework::GradVarName("X"), + context->GetInputsDim("X")); + } +}; + +class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad_op = new framework::OpDesc(); + grad_op->SetType("conditional_block_grad"); + grad_op->SetInput("X", Input("X")); + grad_op->SetInput("Params", Input("Params")); + grad_op->SetInput("Out", Output("Out")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetInput("Scope", Output("Scope")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + grad_op->SetOutput(framework::GradVarName("Params"), + InputGrad("Params", false)); + grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); + grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp, + ops::ConditionalBlockOpProtoMaker, + ops::ConditionalBlockGradMaker); +REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp, + ops::ConditionalBlockGradInferShape); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a729d376ac8c3dc49ec06271c3ffef6406a20b28 
--- /dev/null +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -0,0 +1,330 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; + +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = + static_cast(1024) * 1024 * 1024; + +template +class CUDNNConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + 
const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it mannually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. + PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims()), groups); + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims()), groups); + + int input_channels = input->dims()[1]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + int output_channels = filter->dims()[0]; + int output_height, output_width, output_depth; + if (output->dims().size() == 5) { + output_depth = output->dims()[2]; + output_height = output->dims()[3]; + output_width = output->dims()[4]; + } else { + output_depth = 1; + output_height = output->dims()[2]; + output_width = output->dims()[3]; + } + + int 
group_offset_in = + input_channels / groups * input_height * input_width * input_depth; + int group_offset_out = + output_channels / groups * output_height * output_width * output_depth; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn conv workspace --------------------- + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionFwdAlgo_t algo; + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + // get workspace size able to allocate + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + // Allocate on GPU memory + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv forward --------------------- + T alpha = 1.0f, beta = 0.0f; + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +template +class CUDNNConvGradOpKernel : public 
framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_grad_desc; + + ScopedFilterDescriptor filter_desc; + ScopedFilterDescriptor filter_grad_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it mannually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. 
+ PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_grad_desc = + output_grad_desc.descriptor( + layout, framework::vectorize2int(output_grad->dims()), groups); + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims()), groups); + + int input_channels = input->dims()[1]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + + int output_grad_channels = filter->dims()[0]; + int output_grad_height, output_grad_width, output_grad_depth; + if (input->dims().size() == 5) { + output_grad_depth = output_grad->dims()[2]; + output_grad_height = output_grad->dims()[3]; + output_grad_width = output_grad->dims()[4]; + } else { + output_grad_depth = 1; + output_grad_height = output_grad->dims()[2]; + output_grad_width = output_grad->dims()[3]; + } + + int group_offset_in = + input_channels / groups * input_height * input_width * input_depth; + int group_offset_out = output_grad_channels / groups * output_grad_height * + output_grad_width * output_grad_depth; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t workspace_size_in_bytes = 0, tmp_size = 0; + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + + auto& dev_ctx = ctx.template 
device_context(); + auto handle = dev_ctx.cudnn_handle(); + if (input_grad) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, + // dyDesc: Handle to the previously initialized input differential + // tensor descriptor. + cudnn_output_grad_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_input_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + + if (filter_grad) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + // ------------------- cudnn conv workspace --------------------- + // Already on GPU + void* cudnn_workspace = nullptr; + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- + T alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. 
+ + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + i * group_offset_in)); + } + } + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset filter_grad. + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_output_grad_desc, output_grad_data + i * group_offset_out, + cudnn_conv_desc, filter_algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_filter_desc, + filter_grad_data + i * group_offset_filter)); + } + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); + +REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a047e579163cfe9cd0d053f337b7a92339466a96 --- /dev/null +++ 
b/paddle/fluid/operators/conv_op.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_op.h" + +namespace paddle { +namespace operators { + +void ConvOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + int groups = ctx->Attrs().Get("groups"); + std::vector dilations = ctx->Attrs().Get>("dilations"); + + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, + "Conv intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + "Conv input dimension and filter dimension should be the same."); + PADDLE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "Conv input dimension and strides dimension should be consistent."); + PADDLE_ENFORCE_EQ( + paddings.size(), strides.size(), + "Conv paddings dimension and Conv strides dimension should be the same."); + + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, + "The number of input channels should 
be equal to filter " + "channels * groups."); + + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + "The number of output channels should be divided by groups."); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] - + (dilations[i] * (filter_dims[i + 2] - 1) + 1) > + 0, + "Due to the settings of paddings, filter_dims and " + "dilations, the output size is less than 0, please check " + "again."); + output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Output"); +} + +framework::OpKernelType ConvOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution operator. 
" + "The format of the filter tensor is MCHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "H is the height of the filter, and W is the width of the filter. " + "If the groups attribute is greater than 1, C equals the number of " + "input image channels divided by the groups."); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0}), the " + "paddings(h_pad, w_pad) of " + "convolution operator.") + .SetDefault({0, 0}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need set use_cudnn to true." 
+ "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution Operator. + +The convolution operation calculates the output based on the input, filter +and strides, paddings, dilations, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and Output(Output) are in NCHW format. Where N is batch +size, C is the number of channels, H is the height of the feature, and W is +the width of the feature. +Filters(Input) is MCHW format. Where M is the number of output image channels, C is +the number of input image channels, H is the height of the filter, and W +is the width of the filter. +Parameters(strides, paddings, dilations) are two elements. These two elements represent +height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where +$$ + H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 +$$ +)DOC"); +} + +Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution operator. " + "The format of input tensor is NCDHW. Where N is batch size, C is the " + "number of channels, D is the depth of the feature, H is the height of " + "the feature, " + "and W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution operator. 
" + "The format of the filter tensor is MCDHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "D is the depth of the filter, H is the height of the filter, and W " + "is the width of the filter." + "If the groups attribute is greater than 1, C equals the number of " + "input image channels divided by the groups."); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator." + "The format of output tensor is also NCDHW."); + AddAttr>("strides", + "(vector, default:{1, 1, 1}), the " + "strides(d_stride, h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1, 1}); + AddAttr>("paddings", + "(vector, default:{0, 0, 0}), the " + "paddings(d_pad, h_pad, w_pad) of convolution " + "operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation, h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. 
workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + + AddComment(R"DOC( +Convolution3D Operator. + +The convolution operation calculates the output based on the input, filter +and strides, paddings, dilations, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCDHW format, where N is batch +size, C is the number of channels,D is the depth of the feature, H is the height of +the feature, and W is the width of the feature. +Filters(Input) is MCDHW format, where M is the number of output image channels, +C is the number of input image channels, D is the depth of the filter, +H is the height of the filter, and W is the width of the filter. +Parameters(strides, paddings, dilations) are three elements. These three elements +represent depth, height and width, respectively. +The input(X) size and output(Out) size may be different. 
+ +Example: + Input: + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\ + H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1 + $$ +)DOC"); +} + +void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +framework::OpKernelType ConvOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, + ops::ConvOpGrad); + +// depthwise convolution op +REGISTER_OP(depthwise_conv2d, 
ops::ConvOp, ops::Conv2DOpMaker, + depthwise_conv2d_grad, ops::ConvOpGrad); +REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, + ops::ConvOpGrad); + +// depthwise conv kernel +// TODO(xingzhaolong): neon kernel for mobile +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d, + ops::GemmConvKernel, + ops::GemmConvKernel); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CPU_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CPU_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2129d3b461249b5d1b317edde924ffc04f4f90f --- /dev/null +++ b/paddle/fluid/operators/conv_op.cu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d, + ops::DepthwiseConvKernel, + ops::DepthwiseConvKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad, + ops::DepthwiseConvGradKernel, + ops::DepthwiseConvGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1156e6c8fe3263607d4dcd1af0c9996acd9368fb --- /dev/null +++ b/paddle/fluid/operators/conv_op.h @@ -0,0 +1,422 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/depthwise_conv.h"
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Base convolution operator definitions for other conv
+// like operators to reuse the implementation.
+
+// Returns the output extent of a convolution along one spatial dimension.
+//
+// The dilated kernel covers dilation * (filter_size - 1) + 1 input elements.
+// The result is (input_size + 2 * padding - dilated_kernel) / stride + 1,
+// using integer division (truncating). No argument is validated here; a
+// stride of 0 divides by zero.
+inline int OutputSize(int input_size, int filter_size, int dilation,
+                      int padding, int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  const int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  return output_size;
+}
+
+// Returns false only when every spatial filter extent is 1, every stride is
+// 1, every padding is 0 and every dilation is 1; returns true otherwise.
+// The kernels in this file use the result to decide whether an
+// im2col/vol2col buffer has to be built.
+//
+// filter_dim is read from index 2 onwards (the first two entries are
+// skipped), so it must hold at least strides.size() + 2 entries. paddings
+// and dilations are indexed with the same counter as strides and must be at
+// least as long. All four arguments are only read, hence the const
+// references.
+inline bool IsExpand(const std::vector& filter_dim,
+                     const std::vector& strides,
+                     const std::vector& paddings,
+                     const std::vector& dilations) {
+  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+  for (size_t j = 0; j < strides.size(); ++j) {
+    filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1);
+    strides_1 = strides_1 && (strides[j] == 1);
+    padding_0 = padding_0 && (paddings[j] == 0);
+    dilation_1 = dilation_1 && (dilations[j] == 1);
+  }
+  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+}
+
+// Define Op classes in .h file so that other conv
+// operator implementations can reuse the code.
+class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class ConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +template +class GemmConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. 
+ Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec(framework::vectorize(output->dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * + // o_h * o_w) + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
+ Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape, context.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + auto& dev_ctx = context.template device_context(); + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(dev_ctx, in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(dev_ctx, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); + } + } + } +}; + +template +class GemmConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + 
const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + // The filter and filter_grad will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec( + framework::vectorize(output_grad->dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) + // or + // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + 
filter.Resize(filter_matrix_shape); + + framework::DDim output_matrix_shape = { + output_grad->dims()[1], + output_grad->numel() / + (output_grad->dims()[0] * output_grad->dims()[1])}; + + // convolution backward input operator: gemm + col2im(or col2vol) + // convolution backward weight operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output_grad->dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape, context.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + + // if is_expand is false, the operation of set_zero is unnecessary, + // because math::matmul will reset input_grad. 
+ if (is_expand) { + set_zero(dev_ctx, input_grad, static_cast(0)); + } + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + Tensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + Tensor in_grad_slice = + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col_matrix.ShareDataWith(in_grad_slice); + col_matrix.Resize(col_matrix_shape); + } + math::matmul(dev_ctx, filter_slice, true, + out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &in_grad_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); + } + } + } + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + Tensor filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // im2col + Tensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, in_slice, dilations, strides, + 
std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor filter_grad_slice = + filter_grad_.Slice(g * out_step, (g + 1) * out_step); + math::matmul(dev_ctx, out_grad_slice, false, + col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); + } + } + } + } +}; + +template +class DepthwiseConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_EQ( + output->dims()[1] % input->dims()[1], 0, + "The output channels must be a multiple of the input channels"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::DepthwiseConvFunctor depthwiseConv; + + auto& dev_ctx = context.template device_context(); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); + } +}; + +template +class DepthwiseConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + 
math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, input_grad); + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, + filter_grad); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a96aac63e09b47c9afe99b2e622a718839ba047c --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_shift_op.h" +#include "paddle/fluid/framework/eigen.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +class ConvShiftOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The 1st dimension of Input(X) and Input(Y) should " + "be equal."); + PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1, + "The 2nd dimension of Input(Y) should be odd."); + PADDLE_ENFORCE_LE(y_dims[1], x_dims[1], + "The 2nd dimension of Input(Y) should be less than or " + "equal to the 2nd dimension of Input(X)."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ConvShiftGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto y_grad_name = framework::GradVarName("Y"); + if 
(ctx->HasOutput(y_grad_name)) { + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape B x M, " + "where B is the batch size and M is the data dimension."); + AddInput("Y", + "(Tensor, default Tensor), a 2-D tensor with shape B x N, " + "where B is the batch size and N is the data dimension. N must " + "be odd."); + AddOutput("Out", + "(Tensor, default Tensor), a 2-D tensor with shape B x M, " + "i.e., the same shape as X."); + AddComment(R"DOC( +ConvShift Operator. + +A layer for circular convolution of two vectors, +as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401 + +The equation is: + +$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$ + +where X's index is computed modulo M, and Y's index is computed modulo N. + +Both inputs X and Y can carry LoD (Level of Details) information. +However, the output only shares the LoD information with input X. 
+ +)DOC"); + } +}; + +template +class ConvShiftKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = EigenMatrix::From(*X); + auto y = EigenMatrix::From(*Y); + auto out = EigenMatrix::From(*Out); + out.setZero(); + + size_t batch_size = X->dims()[0]; + size_t x_width = X->dims()[1]; + size_t y_width = Y->dims()[1]; + size_t y_half_width = (y_width - 1) / 2; + + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + out(k, i) += x(k, index) * y(k, j); + } + } + } + } +}; + +template +class ConvShiftGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *dOut = context.Input(framework::GradVarName("Out")); + auto *dX = context.Output(framework::GradVarName("X")); + auto *dY = context.Output(framework::GradVarName("Y")); + + auto x = EigenMatrix::From(*X); + auto y = EigenMatrix::From(*Y); + auto dout = EigenMatrix::From(*dOut); + + auto x_dims = X->dims(); + auto y_dims = Y->dims(); + size_t batch_size = x_dims[0]; + size_t x_width = x_dims[1]; + size_t y_width = y_dims[1]; + size_t y_half_width = (y_width - 1) / 2; + + // The below trades code duplication for efficiency (keeping the if + // statement outside of the loop). 
+ if (dX) { + dX->mutable_data(context.GetPlace()); + auto dx = EigenMatrix::From(*dX); + dx.setZero(); + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + dx(k, index) += dout(k, i) * y(k, j); + } + } + } + } + + if (dY) { + dY->mutable_data(context.GetPlace()); + auto dy = EigenMatrix::From(*dY); + dy.setZero(); + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + dy(k, j) += x(k, index) * dout(k, i); + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker, + conv_shift_grad, ops::ConvShiftGradOp); +REGISTER_OP_CPU_KERNEL(conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CPU_KERNEL( + conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9818707ce3b98afe25050336d85e3b05919620f3 --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -0,0 +1,197 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/conv_shift_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+namespace {
+
+// Integer division of x by y, rounded up (for positive operands).
+inline int DivUp(int x, int y) { return (x + y - 1) / y; }
+
+// Some notes on the design:
+//
+// Each thread is responsible for computing a single output out[k, i].
+// Thread blocks are based on tiles of x with height 1 in the batch dimension.
+//
+// This design is based on the typical use case where the filter
+// y is fairly small. For large y, it would probably be more efficient
+// to also tile across y.
+//
+// x and out are indexed as [k * x_width + i], y as [k * y_width + j].
+// blockIdx.x selects the tile of x, blockIdx.y selects the batch row k.
+// batch_size is accepted but not read by the kernel.
+//
+// Dynamic shared memory layout, in elements of T:
+//   sx     = mem[0 .. num_x)                the tile of x, shifted left by
+//                                           y_half_width (cyclically)
+//   sx_pad = mem[num_x .. num_x + y_width)  the elements that follow the tile
+//   sy     = mem[blockDim.x + y_width ..)   y[k, :]
+// so the launch must provide at least (blockDim.x + 2 * y_width) elements.
+template
+__global__ void ConvShiftForward(const T *x, const T *y, int x_width,
+                                 int y_width, int y_half_width, int batch_size,
+                                 T *out) {
+  extern __shared__ T mem[];
+
+  const int block_width = blockDim.x;
+  const int block_start = blockIdx.x * block_width;
+
+  int tx = threadIdx.x;
+  int i = block_start + tx;  // global x index
+  int k = blockIdx.y;        // batch index
+
+  // Number of outputs this block is responsible for. Every block handles
+  // blockDim.x outputs except possibly the last one, which handles whatever
+  // is left. The remainder is obtained by subtraction rather than
+  // x_width % blockDim.x: the modulo is 0 when x_width is an exact multiple
+  // of blockDim.x, which would make the last block write no output at all.
+  const int remaining = x_width - block_start;
+  const int num_x = remaining < block_width ? remaining : block_width;
+
+  T *sx = mem;
+  T *sx_pad = &mem[num_x];
+  T *sy = &mem[blockDim.x + y_width];
+
+  // Collaboratively load y[k, :] and length-y padding of x into shared memory.
+  // x_width is added before the subtraction so the index stays non-negative
+  // ahead of the modulo.
+  int pad_start = block_start + num_x + x_width - y_half_width;
+  for (int j = tx; j < y_width; j += blockDim.x) {
+    sy[j] = y[k * y_width + j];
+    sx_pad[j] = x[k * x_width + (pad_start + j) % x_width];
+  }
+
+  // Load a cyclically shifted slice of x into shared memory.
+  if (tx < num_x) {
+    int load_i = (i - y_half_width + x_width) % x_width;
+    sx[tx] = x[k * x_width + load_i];
+  }
+  // All loads must be visible before any thread reads its window below.
+  __syncthreads();
+
+  if (tx < num_x) {
+    // Compute dot product of sx[tx:tx + y_width] and sy.
+    T sum = 0;
+    for (int j = 0; j < y_width; ++j) {
+      sum += sx[tx + j] * sy[j];
+    }
+
+    // Save to out[k, i].
+    out[k * x_width + i] = sum;
+  }
+}
+
+// Compute x gradient - initial naive implementation with atomic add.
+//
+// One thread per (i, j, k): x index from blockIdx.x/threadIdx.x, y index from
+// blockIdx.y, batch index from blockIdx.z. Each thread adds
+// dout[k, i] * y[k, j] into dx[k, (i + j - y_half_width) mod x_width], so dx
+// must be zero-filled before the launch. batch_size is not read.
+template
+__global__ void ConvShiftGradX(const T *dout, const T *y, int x_width,
+                               int y_width, int y_half_width, int batch_size,
+                               T *dx) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
+  int j = blockIdx.y;                             // y index
+  int k = blockIdx.z;                             // batch index
+
+  if (i < x_width) {
+    int index = (i + j - y_half_width + x_width) % x_width;
+    atomicAdd(&dx[k * x_width + index],
+              dout[k * x_width + i] * y[k * y_width + j]);
+  }
+}
+
+// Compute y gradient - initial naive implementation with atomic add.
+//
+// Same thread-to-(i, j, k) mapping as ConvShiftGradX. Each thread adds
+// x[k, (i + j - y_half_width) mod x_width] * dout[k, i] into dy[k, j], so dy
+// must be zero-filled before the launch. batch_size is not read.
+template
+__global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width,
+                            int y_half_width, int batch_size, T *dy) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
+  int j = blockIdx.y;                             // y index
+  int k = blockIdx.z;                             // batch index
+
+  if (i < x_width) {
+    int index = (i + j - y_half_width + x_width) % x_width;
+    atomicAdd(&dy[k * y_width + j],
+              x[k * x_width + index] * dout[k * x_width + i]);
+  }
+}
+}  // namespace
+
+// CUDA forward kernel class for conv_shift. Reads inputs "X" and "Y", writes
+// output "Out", and runs ConvShiftForward with 256 x positions per block and
+// one grid row per batch entry.
+template
+class ConvShiftKernel
+    : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input("X");
+    const Tensor *Y = context.Input("Y");
+    Tensor *Out = context.Output("Out");
+    const T *x_data = X->data();
+    const T *y_data = Y->data();
+    T *out_data = Out->mutable_data(context.GetPlace());
+
+    int batch_size = X->dims()[0];
+    int x_width = X->dims()[1];
+    int y_width = Y->dims()[1];
+    int y_half_width = (y_width - 1) / 2;
+
+    const int x_per_block = 256;
+    int num_x_blocks = DivUp(x_width, x_per_block);
+    // Shared memory for the x tile, its y_width padding and a copy of y.
+    int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T);
+
+    dim3 grid_dim(num_x_blocks, batch_size);
+
+    auto stream =
+        context.template device_context().stream();
+
+    ConvShiftForward<<>>(
+        x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data);
+  }
+};
+
+// CUDA backward kernel class for conv_shift. For each of the gradient
+// outputs X@GRAD and Y@GRAD that is present, allocates it, zero-fills it
+// (the gradient kernels accumulate with atomicAdd) and runs the matching
+// kernel over a (x tiles, y_width, batch_size) grid.
+template
+class ConvShiftGradKernel
+    : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input("X");
+    const Tensor *Y = context.Input("Y");
+    const Tensor *dOut = context.Input(framework::GradVarName("Out"));
+    const T *x_data = X->data();
+    const T *y_data = Y->data();
+    const T *dout_data = dOut->data();
+
+    Tensor *dX = context.Output(framework::GradVarName("X"));
+    Tensor *dY = context.Output(framework::GradVarName("Y"));
+
+    int batch_size = X->dims()[0];
+    int x_width = X->dims()[1];
+    int y_width = Y->dims()[1];
+    int y_half_width = (y_width - 1) / 2;
+
+    auto &device_ctx =
+        context.template device_context();
+    math::SetConstant zero;
+
+    const int x_per_block = 256;
+    int num_x_blocks = DivUp(x_width, x_per_block);
+    dim3 grid_dim(num_x_blocks, y_width, batch_size);
+
+    if (dX) {
+      T *dx_data = dX->mutable_data(context.GetPlace());
+      zero(device_ctx, dX, static_cast(0.0));
+      ConvShiftGradX<<>>(
+          dout_data, y_data, x_width, y_width, y_half_width, batch_size,
+          dx_data);
+    }
+    if (dY) {
+      T *dy_data = dY->mutable_data(context.GetPlace());
+      zero(device_ctx, dY, static_cast(0.0));
+      ConvShiftDy<<>>(
+          x_data, dout_data, x_width, y_width, y_half_width, batch_size,
+          dy_data);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    conv_shift,
+    ops::ConvShiftKernel);
+REGISTER_OP_CUDA_KERNEL(
+    conv_shift_grad,
+    ops::ConvShiftGradKernel);
diff --git a/paddle/fluid/operators/conv_shift_op.h b/paddle/fluid/operators/conv_shift_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..987a690895e2a7428f058eb2d8366f9c7572912b
--- /dev/null
+++ b/paddle/fluid/operators/conv_shift_op.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ConvShiftKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class ConvShiftGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0aed4ebeffa7312c218bb892fbcdf9cd9cdc53ca --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -0,0 +1,251 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; + +static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024; + +template +class CUDNNConvTransposeOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + // cudnn v5 does not support dilations + std::vector dilations = ctx.Attr>("dilations"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout; + + if (strides.size() == 2U) { + layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + // (N, M, H, W) or (N, M, D, H, W) + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) + 
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims())); + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + // ------------------- cudnn conv workspace --------------------- + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + if (user_workspace_size > 0) { + // Widen to size_t before multiplying: user_workspace_size is an int and + // the MB -> bytes conversion overflows 32 bits from 2048 MB upwards. + workspace_size_limit = + static_cast<size_t>(user_workspace_size) * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t algo; + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + // Get the algorithm + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor.
+ cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + + // get workspace size able to allocate + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + + // Allocate on GPU memory + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + + // ------------------- cudnn conv transpose forward --------------------- + T alpha = 1.0f, beta = 0.0f; + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc, + input_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +template +class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + // cudnn v5 does not support dilations + std::vector dilations = ctx.Attr>("dilations"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor 
output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + // Pick the layout from the number of spatial dims, as the forward kernel + // does; this kernel is also registered for conv3d_transpose_grad. + DataLayout layout = + strides.size() == 2U ? DataLayout::kNCHW : DataLayout::kNCDHW; + + // Input: (N, M, H, W) or (N, M, D, H, W) + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output_grad->dims())); + // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims())); + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionFwdAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t bwd_filter_ws_size, fwd_ws_size; + size_t workspace_size_in_bytes = 0; + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + if (user_workspace_size > 0) { + // Widen to size_t before multiplying: user_workspace_size is an int and + // the MB -> bytes conversion overflows 32 bits from 2048 MB upwards. + workspace_size_limit = + static_cast<size_t>(user_workspace_size) * 1024 * 1024; + } + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + if (input_grad) { + // choose backward algorithm for data + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_input_desc, data_algo, &fwd_ws_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); + } + + if (filter_grad) { + // choose backward algorithm for filter + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle,
cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + // get workspace for backwards filter algorithm + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); + workspace_size_in_bytes = + std::max(workspace_size_in_bytes, bwd_filter_ws_size); + } + + // ------------------- cudnn conv workspace --------------------- + // Already on GPU + void* cudnn_workspace = nullptr; + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- + // FIXME(typhoonzero): template type T may not be the same as cudnn call. + T alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, output_grad_data, + cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data)); + } + + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset filter_grad. 
+ // Gradient with respect to the filter + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, + input_data, cudnn_conv_desc, filter_algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data)); + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); + +REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..974cffad92871c1a855c86a7a2e56f8e65819428 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -0,0 +1,323 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_transpose_op.h" + +namespace paddle { +namespace operators { + +void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvTransposeOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = ctx->Attrs().Get>("dilations"); + + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], + "In ConvTransposeOp, The input channel should be the same " + "as the number of filters."); + + std::vector output_shape({in_dims[0], filter_dims[1]}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + + filter_extent); + } + ctx->SetOutputDim("Output", 
framework::make_ddim(output_shape)); +} + +framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution transpose operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of input channels, H is the height of the feature, and " + "W is the width of the feature."); + AddInput( + "Filter", + "(Tensor) The filter tensor of convolution transpose operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels," + "H is the height of the filter, and W is the width of the filter. " + "We enforce groups number == 1 in the convolution transpose scenario."); + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator. 
" + "The format of output tensor is also NCHW."); + + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1}); + AddAttr>( + "strides", + "(vector default:{1, 1}), the strides(h_stride, w_stride) of " + "convolution transpose operator.") + .SetDefault({1, 1}); + AddAttr>( + "paddings", + "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution " + "transpose operator.") + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution2D Transpose Operator. + +The convolution transpose operation calculates the output based on the input, filter +and dilations, strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Filter(Input) is in MCHW format. Where M is the number of input feature channels, +C is the number of output feature channels, H is the height of the filter, +and W is the width of the filter. 
+Parameters(strides, paddings) are two elements. These two elements represent height +and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 + $$ +)DOC"); +} + +Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) The input tensor of convolution transpose operator." + "The format of input tensor is NCDHW. Where N is batch size, C is " + "the number of channels, D is the depth of the feature, H is the " + "height of the feature, and " + "W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution transpose operator." + "The format of the filter tensor is MCDHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels, D " + "is the depth of the filter, H is the height of the filter, and " + "W is the width of the filter." + "We enforce groups number == 1 and padding == 0 in " + "the convolution3d transpose scenario."); + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator." + "The format of output tensor is also NCDHW." 
+ "Where N is batch size, C is " + "the number of channels, D is the depth of the feature, H is the " + "height of the feature, and W is the width of the feature."); + + AddAttr>( + "dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation,h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1, 1}); + AddAttr>("strides", + "(vector default:{1, 1, 1}), the " + "strides{d_stride, h_stride, w_stride} of " + "convolution transpose operator.") + .SetDefault({1, 1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0, 0}), paddings(d_pad, " + "h_pad, w_pad) of convolution transpose operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution3D Transpose Operator. + +The convolution transpose operation calculates the output based on the input, filter +and dilations, strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the +number of channels, D is the depth of the feature, H is the height of the feature, +and W is the width of the feature. +Filter(Input) is in MCDHW format. 
Where M is the number of input feature channels, +C is the number of output feature channels, D is the depth of the filter,H is the +height of the filter, and W is the width of the filter. +Parameters(strides, paddings) are three elements. These three elements represent +depth, height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\ + H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 + $$ +)DOC"); +} + +void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return 
framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, + conv2d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, + conv3d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed90c6ec6265cc914a172b6c7217a204981e7fd1 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.cu.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_transpose_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f512575468626edfb3e36c007e26b05faff0a06d --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -0,0 +1,291 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +// Define Op classes in .h file so that other conv transpose +// operator implementations can reuse the code. 
+class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class ConvTransposeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvTransposeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +template +class GemmConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped, so it should not be constant pointer + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + // groups will alway be disabled in conv2dtranspose. 
+ + const int batch_size = static_cast(input->dims()[0]); + + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} + std::vector input_shape_vec = framework::vectorize(input->dims()); + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } + DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); + + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
+ Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + DDim output_shape = + framework::slice_ddim(output->dims(), 1, output->dims().size()); + + // input matrix size: (m, h * w) or (m, d * h * w) + DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + output->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); + + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + + // convolution transpose: gemm + col2im or col2vol (similar to conv-backward + // on input) + for (int i = 0; i < batch_size; i++) { + // batch with size (m, h * w) or (m, d * h * w) + Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); + + // col_matrix = filter^T * input_batch + // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + math::matmul(dev_ctx, filter, true, input_batch, false, + static_cast(1.0), &col_matrix, + static_cast(0.0)); + + if (data_dim == 2U) { + // col2im: col_matrix -> output_batch + // from (c * k_h * k_w, h * w) to (c, o_h, o_w) + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &output_batch); + } else if (data_dim == 3U) { + // col2vol: col_matrix -> output_batch + // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) + col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); + } + } + } +}; + +template +class GemmConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override 
{ + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + // For filter, we do not use const pointer b/c we will do reshape, + // but we should avoid modifying its value. + Tensor filter = *context.Input("Filter"); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + + if ((!input_grad) && (!filter_grad)) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} + std::vector input_shape_vec = framework::vectorize(input->dims()); + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output_grad->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } + DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + DDim output_shape = framework::slice_ddim(output_grad->dims(), 1, + output_grad->dims().size()); + + // input matrix size: (m, h * w) or (m, d * h * w) + DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c * k_h * 
k_w) or (m, c * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + // convolution transpose grad on input: + // im2col + gemm (similar to conv-forward) + // input need to compute gradient + auto& dev_ctx = context.template device_context(); + if (input_grad || filter_grad) { + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + Tensor filter_grad_; + math::SetConstant set_zero; + + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + } + if (filter_grad) { // filter size (m, c, k_h, k_w) + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + } + + for (int i = 0; i < batch_size; i++) { + // batch with size (c, o_h * o_w) + Tensor output_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_shape); + + if (data_dim == 2U) { + // im2col: dy -> col matrix + // from (c, o_h, o_w) to (c * k_h * k_w, h * w) + im2col(dev_ctx, output_grad_batch, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col: dy -> col_matrix + // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); + } + + if (input_grad) { + // batch with size (m, h, w) + Tensor input_grad_batch = + input_grad->Slice(i, i + 1).Resize(input_matrix_shape); + // gemm: dx = filter * dy + // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w) + // or + // (m, c * k_d * k_h * k_w) * (c * k_d * k_h 
* k_w, d * h * w) -> (m, + // d * h * w) + math::matmul( + dev_ctx, filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); + } + if (filter_grad) { + // input batch + Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + // gemm: d_filter = x * dy^T + // (m, h * w) * (h * w, c * k_h * k_w) -> (m, c * k_h * k_w) + // or + // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * + // k_h * k_w) + math::matmul(dev_ctx, in_batch, false, col_matrix, + true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); + } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..57c5a6025a03fbafadb56a3dbec9c4cfab5e979a --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cos_sim_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CosSimOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // notnull check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("XNorm"), + "Output(XNorm) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("YNorm"), + "Output(YNorm) of CosSimOp should not be null."); + + // shape check + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), + framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + + // resize tensor + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->SetOutputDim("XNorm", {x_dims[0], 1}); + ctx->SetOutputDim("YNorm", {y_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The 1st input of cos_sim op."); + AddInput("Y", "The 2nd input of cos_sim op."); + AddOutput("Out", "The output of cos_sim op."); + 
AddOutput("XNorm", + "Norm of the first input, reduced along the 1st " + "dimension.") + .AsIntermediate(); + AddOutput("YNorm", + "Norm of the second input, reduced along the 1st " + "dimension.") + .AsIntermediate(); + + AddComment(R"DOC( +Cosine Similarity Operator. + +$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$ + +The input X and Y must have the same shape, except that the 1st dimension +of input Y could be just 1 (different from input X), which will be +broadcasted to match the shape of input X before computing their cosine +similarity. + +Both the input X and Y can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. + +)DOC"); + } +}; + +class CosSimOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // notnull check + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("XNorm"), "Input(XNorm) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("YNorm"), "Input(YNorm) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + // shape check (ranks must match exactly, same as CosSimOp::InferShape) + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto xnorm_dims = ctx->GetInputDim("XNorm"); + auto ynorm_dims = ctx->GetInputDim("YNorm"); + auto out_dims = ctx->GetInputDim("Out"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), + 
framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + auto target_xnorm_dims = framework::make_ddim({x_dims[0], 1}); + auto target_ynorm_dims = framework::make_ddim({y_dims[0], 1}); + PADDLE_ENFORCE_EQ(xnorm_dims, target_xnorm_dims, + "Shape of Input(XNorm) must be [X.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(ynorm_dims, target_ynorm_dims, + "Shape of Input(YNorm) must be [Y.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(out_dims, target_xnorm_dims, + "Shape of Input(Out) must be [X.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(out_grad_dims, target_xnorm_dims, + "Shape of Input(Out@Grad) must be [X.Dim(0), 1]."); + + // resize tensor + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, + ops::CosSimOpGrad); +REGISTER_OP_CPU_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..c8cf363cdc4009bd8fa233a52435dcc6ea56cf3c --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/cos_sim_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9cd8b196daf6e4afe9bde4d91db0110430cd7324 --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cos_sim_functor.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CosSimKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get Tensor + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* out_z = context.Output("Out"); + auto* out_x_norm = context.Output("XNorm"); + auto* out_y_norm = context.Output("YNorm"); + out_z->mutable_data(context.GetPlace()); + out_x_norm->mutable_data(context.GetPlace()); + out_y_norm->mutable_data(context.GetPlace()); + + int rows_x = in_x->dims()[0]; + int rows_y = in_y->dims()[0]; + + int cols = framework::product(in_x->dims()) / rows_x; + + if (rows_x == rows_y) { + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); + } else { + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); + } + } +}; + +template +class CosSimGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get Tensor + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* in_z = context.Input("Out"); + auto* in_x_norm = context.Input("XNorm"); + auto* in_y_norm = context.Input("YNorm"); + auto* out_grad_x = context.Output(framework::GradVarName("X")); + auto* out_grad_y = context.Output(framework::GradVarName("Y")); + auto* in_grad_z = 
context.Input(framework::GradVarName("Out")); + + // compute gradient; cols is the number of elements per row of X + int rows_x = in_x->dims()[0]; + int rows_y = in_y->dims()[0]; + int cols = framework::product(in_x->dims()) / rows_x; + + if (rows_x == rows_y) { + if (out_grad_x) { + math::CosSimGradFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + if (out_grad_y) { + math::CosSimGradFunctor functor( + in_y_norm->data(), in_x_norm->data(), in_y->data(), + in_x->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + } else { + if (out_grad_x) { + math::CosSimDxFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + if (out_grad_y) { + out_grad_y->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, out_grad_y, static_cast(0)); + + math::CosSimDyFunctor functor; + functor(dev_ctx, in_x_norm->data(), in_y_norm->data(), + in_x->data(), in_y->data(), in_z->data(), + in_grad_z->data(), static_cast(rows_x), + static_cast(cols), out_grad_y->data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/create_reader_op.cc b/paddle/fluid/operators/create_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1ba51f2c0f13a1b6e4d7ccb93c912703a0b1d86 --- /dev/null +++ b/paddle/fluid/operators/create_reader_op.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace operators { + +static std::vector RestoreShapes( + const std::vector& shape_concat, const std::vector& ranks) { + std::vector res; + int offset = 0; + for (int len : ranks) { + auto start_it = shape_concat.begin() + offset; + auto end_it = start_it + len; + res.push_back(framework::make_ddim(std::vector(start_it, end_it))); + offset += len; + } + return res; +} + +// general infershape for file readers +class CreateFileReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output file reader should not be null."); + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + if (ctx->IsRuntime()) { + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ( + lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } + } +}; + +// general 
infershape for decorated readers +class CreateDecoratedReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), + "Input(UnderlyingReader) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output decorated reader should not be null."); + ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); + + if (ctx->IsRuntime()) { + framework::VarDesc* in_reader = boost::get( + ctx->GetInputVarPtrs("UnderlyingReader")[0]); + framework::VarDesc* out_reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + out_reader->SetLoDLevels(in_reader->GetLoDLevels()); + } + } +}; + +// general var type inference for file readers +class CreateFileReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + reader->SetType(framework::proto::VarDesc::READER); + } +}; + +// general var type inference for decorated readers +class CreateDecoratedReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; + framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); + std::string out_reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); + out_reader->SetType(framework::proto::VarDesc::READER); + out_reader->SetDataTypes(in_reader->GetDataTypes()); + } +}; + +template +class CreateRandomDataGeneratorOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& 
dev_place) const override { + const auto& shape_concat = Attr>("shape_concat"); + const auto& ranks = Attr>("ranks"); + PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); + PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), + int(shape_concat.size()), + "The accumulate of all ranks should be equal to the " + "shape concat's length."); + std::vector shapes = RestoreShapes(shape_concat, ranks); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::RandomDataGenerator(shapes, Attr("min"), + Attr("max"))); + } +}; + +class CreateRandomDataGeneratorOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddOutput("Out", "(ReaderHolder) The created random reader."); + AddAttr>("shape_concat", + "The concat of all data's shapes."); + AddAttr>( + "ranks", + "The ranks of each data." + "e.g." + "shape_concat = [2,3,4,5,6]" + "ranks = [3,2]" + "It means the reader will generate two data each time," + "whose shapes are [2,3,4] and [5,6] respectively."); + AddAttr>("lod_levels", "The LoD levels of each data."); + AddAttr("min", "The lower bound of reader's uniform distribution."); + AddAttr("max", "The upper bound of reader's uniform distribution."); + AddComment(R"DOC( + CreateRandomDataGenerator Operator + + This Op creates a random reader. + The reader generates random data instead of really reading from files. + Generated data follow an uniform distribution between 'min' and 'max'. 
+ )DOC"); + } +}; + +class CreateShuffleReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::ShuffleReader(underlying_reader.Get(), + Attr("buffer_size"))); + } +}; + +class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput( + "UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a shuffle reader."); + AddOutput("Out", "(ReaderHolder) The created shuffle reader."); + AddAttr("buffer_size", "The shuffle buffer size.").GreaterThan(0); + AddComment(R"DOC( + CreateShuffleReader Operator + + A shuffle reader takes another reader as its 'underlying reader' + and yields the underlying reader's outputs in a shuffled order. 
+ )DOC"); + } +}; + +class CreateBatchReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::BatchReader(underlying_reader.Get(), + Attr("batch_size"))); + } +}; + +class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput( + "UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a batch reader."); + AddOutput("Out", "(ReaderHolder) The created batch reader."); + AddAttr("batch_size", + "How many instances the batch reader yields each time.") + .GreaterThan(0); + AddComment(R"DOC( + CreateBatchReader Operator + + A batch reader takes another reader as its 'underlying reader', + gathers the underlying reader's outputs and then yields them in batches. 
+ )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(create_random_data_generator, + ops::CreateRandomDataGeneratorOp, + ops::CreateFileReaderInferShape, + ops::CreateRandomDataGeneratorOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateFileReaderInferVarType); +REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp, + ops::CreateDecoratedReaderInferShape, + ops::CreateShuffleReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateDecoratedReaderInferVarType); +REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp, + ops::CreateDecoratedReaderInferShape, + ops::CreateBatchReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateDecoratedReaderInferVarType); diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e3c1fc95a3be574635ab8a99aa29b71bd8dbc71e --- /dev/null +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/crf_decoding_op.h"
+
+namespace paddle {
+namespace operators {
+// Declares the inputs, the output and the user-facing documentation of the
+// crf_decoding operator. Input(Label) is dispensable; when it is present the
+// kernel emits a per-position correctness mask instead of the decoded tags.
+class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Emission",
+             "(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape "
+             "[N x D] where N is the size of the mini-batch and D is the total "
+             "tag number. This input is the unscaled emission weight matrix of "
+             "the linear_chain_crf operator.");
+    AddInput(
+        "Transition",
+        "(Tensor, default: Tensor<float>). A Tensor with shape [(D + 2) x D]. "
+        "This input is the transition weights learned by the linear_chain_crf "
+        "operator, denoted as w. The 1st row of w are transition weights for "
+        "the start mask. The 2nd row of w are transition weights for the end "
+        "mask. Transition weights between other tags begin from the 3rd row of "
+        "w. See more details in comments of the linear_chain_crf operator.");
+    AddInput(
+        "Label",
+        "(LoDTensor, LoDTensor<int64_t>). The ground truth with shape "
+        "[N x 1]. This input is optional. See more details in the operator's "
+        "comments.")
+        .AsDispensable();
+    AddOutput(
+        "ViterbiPath",
+        "(LoDTensor, LoDTensor<int64_t>). The decoding results. What to "
+        "return changes depending on whether the Input(Label) (the ground "
+        "truth) is given. See more details in the operator's comment.");
+    AddComment(R"DOC(
+The crf_decoding operator reads the emission feature weights and the transition
+feature weights learned by the linear_chain_crf operator. It implements the
+Viterbi algorithm which is a dynamic programming algorithm for finding the most
+likely sequence of hidden states, called the Viterbi path, that results in a
+sequence of observed tags.
+
+The output of this operator changes according to whether Input(Label) is given:
+
+1. Input(Label) is given:
+
+This happens in training. This operator is used to co-work with the chunk_eval
+operator.
+
+When Input(Label) is given, the crf_decoding operator returns a column vector
+with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
+prediction, or 1 indicating a tag is correctly predicted. Such an output is the
+input to chunk_eval operator.
+
+2. Input(Label) is not given:
+
+This is the standard decoding process.
+
+The crf_decoding operator returns a column vector with shape [N x 1] whose
+values range from 0 to maximum tag number - 1. Each element indicates an index
+of a predicted tag.
+)DOC");
+  }
+};
+
+// Shape inference and kernel selection for crf_decoding.
+class CRFDecodingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Validates Emission [N x D], Transition [(D + 2) x D] and the optional
+  // Label [N x 1], then sets ViterbiPath to [N x 1] sharing Emission's LoD.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Emission"),
+                   "Input(Emission) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Transition"),
+                   "Input(Transition) should be not null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ViterbiPath"),
+                   "Output(ViterbiPath) should be not null.");
+
+    auto emission_dims = ctx->GetInputDim("Emission");
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
+                      "The Input(Emission) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
+
+    auto transition_dims = ctx->GetInputDim("Transition");
+    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
+                      "The Input(Transition) should be a 2-D tensor.");
+    // The two extra rows hold the start and end transition weights.
+    PADDLE_ENFORCE_EQ(
+        transition_dims[0] - 2, transition_dims[1],
+        "An invalid dimension for the Input(Transition), which should "
+        "be a 2-D tensor with shape [(D + 2) x D].");
+    PADDLE_ENFORCE_EQ(
+        emission_dims[1], transition_dims[1],
+        "The 2nd dimension of the Input(Emission) and the Input(Transition) "
+        "should be equal to the tag number.");
+
+    if (ctx->HasInput("Label")) {
+      auto label_dims = ctx->GetInputDim("Label");
+      PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
+                     "The Input(Label) should be a 2-D tensor with the 2nd "
+                     "dimensions fixed to 1.");
+      PADDLE_ENFORCE_EQ(
+          emission_dims[0], label_dims[0],
+          "The height of Input(Emission) and the height of Input(Label) "
+          "should be the same.");
+    }
+
+    ctx->ShareLoD("Emission", /*->*/ "ViterbiPath");
+    ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1});
+  }
+
+ protected:
+  // The kernel data type follows Input(Emission); the op always runs on CPU.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
+        platform::CPUPlace());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp,
+                             ops::CRFDecodingOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    crf_decoding,
+    ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3c161eec5f541b9f60a36064d7e8c350078c664
--- /dev/null
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+#include <limits>
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::LoDTensor;
+using framework::LoD;
+using framework::Tensor;
+
+// CPU kernel of the crf_decoding operator: runs Viterbi decoding on every
+// sequence of Input(Emission) and writes the best tag path to ViterbiPath.
+// When Input(Label) is present the path is replaced by a 0/1 mask telling
+// whether each decoded tag equals the label.
+template <typename DeviceContext, typename T>
+class CRFDecodingOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* emission_weights = ctx.Input<LoDTensor>("Emission");
+    auto* transition_weights = ctx.Input<Tensor>("Transition");
+    auto* label = ctx.Input<LoDTensor>("Label");
+    auto* decoded_path = ctx.Output<Tensor>("ViterbiPath");
+
+    PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL,
+                      "The Input(Emission) should be a sequence.");
+    auto lod = emission_weights->lod();
+    PADDLE_ENFORCE(lod.size(), "Input(Emission) must be a sequence.");
+    const size_t level = 0;
+    const size_t seq_num = lod[level].size() - 1;
+
+    int64_t* path = decoded_path->mutable_data<int64_t>(platform::CPUPlace());
+    math::SetConstant<DeviceContext, int64_t>()(
+        ctx.template device_context<DeviceContext>(), decoded_path, 0);
+    // Decode each sequence independently; the slices alias decoded_path, so
+    // Decode() writes straight into the output buffer.
+    for (size_t i = 0; i < seq_num; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos);
+      Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights,
+             &decoded_path_one_seq);
+    }
+
+    if (label) {
+      PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL,
+                        "The Input(Label) should be a sequence.");
+      const int64_t* label_value = label->data<int64_t>();
+      size_t batch_size = emission_weights->dims()[0];
+      // Turn the decoded tags into a correctness mask (1 = matches label).
+      for (size_t i = 0; i < batch_size; ++i) {
+        path[i] = label_value[i] == path[i] ? 1 : 0;
+      }
+    }
+  }
+
+ private:
+  // Viterbi decoding of a single sequence.
+  //   emission_weights:   [seq_len x tag_num] scores of this sequence.
+  //   transition_weights: [(tag_num + 2) x tag_num]; row 0 holds the start
+  //                       weights, row 1 the end weights, the remaining rows
+  //                       the tag-to-tag transition weights.
+  //   decoded_path:       receives seq_len tag indices.
+  void Decode(const Tensor& emission_weights, const Tensor& transition_weights,
+              Tensor* decoded_path) const {
+    auto emission_dims = emission_weights.dims();
+    const size_t seq_len = emission_dims[0];
+    const size_t tag_num = emission_dims[1];
+
+    // Tag-to-tag transitions start after the start/end rows.
+    const size_t state_trans_base_idx = 2;
+
+    const T* x = emission_weights.data<T>();
+    const T* w = transition_weights.data<T>();
+    int64_t* path = decoded_path->data<int64_t>();
+
+    // alpha is a memo table. An element alpha(k, v) records the score of the
+    // best sequence of tags from position 1 to position k with v being the end
+    // tag.
+    Tensor alpha;
+    T* alpha_value = alpha.mutable_data<T>(emission_dims, platform::CPUPlace());
+    // track(k, v) remembers which previous tag produced alpha(k, v).
+    Tensor track;
+    int* track_value =
+        track.mutable_data<int>(emission_dims, platform::CPUPlace());
+
+    for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
+
+    for (size_t k = 1; k < seq_len; ++k) {
+      for (size_t i = 0; i < tag_num; ++i) {
+        T max_score = -std::numeric_limits<T>::max();
+        int max_j = 0;
+        for (size_t j = 0; j < tag_num; ++j) {
+          T score = alpha_value[(k - 1) * tag_num + j] +
+                    w[(j + state_trans_base_idx) * tag_num + i];
+          if (score > max_score) {
+            max_score = score;
+            max_j = static_cast<int>(j);
+          }
+        }
+
+        alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
+        track_value[k * tag_num + i] = max_j;
+      }
+    }
+
+    // Pick the best final tag, taking the end weights (row 1 of w) into
+    // account, then walk the back-pointers to recover the whole path.
+    T max_score = -std::numeric_limits<T>::max();
+    int max_i = 0;
+    for (size_t i = 0; i < tag_num; ++i) {
+      T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i];
+      if (score > max_score) {
+        max_score = score;
+        max_i = static_cast<int>(i);
+      }
+    }
+    path[seq_len - 1] = max_i;
+    for (int k = seq_len - 1; k >= 1; --k) {
+      path[k - 1] = max_i = track_value[k * tag_num + max_i];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e80f77e497ee21ccd5322b544376f20cb7de012
--- /dev/null
+++
b/paddle/fluid/operators/crop_op.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/crop_op.h"
+#include <boost/lexical_cast.hpp>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+// Shape inference for the crop operator. The output shape comes from the
+// reference input Y when it is given, otherwise from the "shape" attribute.
+class CropOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of CropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of CropOp should not be null.");
+    auto x_dim = ctx->GetInputDim("X");
+    if (!ctx->HasInput("Y")) {
+      auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+      PADDLE_ENFORCE_EQ(
+          static_cast<int64_t>(shape.size()), x_dim.size(),
+          "Shape size should be equal to dimension size of input tensor.");
+      // make_ddim is fed int64_t extents, so widen the int attribute.
+      std::vector<int64_t> tensor_shape(shape.size());
+      for (size_t i = 0; i < shape.size(); ++i) {
+        tensor_shape[i] = static_cast<int64_t>(shape[i]);
+      }
+      ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape));
+    } else {
+      auto y_dim = ctx->GetInputDim("Y");
+      PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y_dim),
+                        "Tensor rank of both CropOp's "
+                        "inputs must be same.");
+      ctx->SetOutputDim("Out", y_dim);
+    }
+  }
+};
+
+// Declares the inputs, output, attributes and documentation of crop.
+class CropOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CropOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input of crop op. "
+             "The input should be a k-D tensor(k > 0 and k < 7).");
+    AddInput("Y",
+             "The input used as reference for cropping, "
+             "which is of the same dimensions as X.")
+        .AsDispensable();
+    AddOutput("Out",
+              "The output of crop op, "
+              "which is of the same dimensions as X.");
+    AddAttr<std::vector<int>>("offsets",
+                              "A list describing offsets to be cropped. "
+                              "The size of offsets list should be the same as "
+                              "the dimension size of input X.");
+    AddAttr<std::vector<int>>("shape",
+                              "A list describing the shape of output. "
+                              "The size of shape list should be the same as "
+                              "the dimension size of input X.")
+        .SetDefault(std::vector<int>());
+    AddComment(R"DOC(
+Crop Operator.
+
+Crop input into output, as specified by offsets and shape.
+
+There are two ways to set shape:
+1. reference input: crop input X into the same shape as reference input.
+                    The dimension of reference input should
+                    be the same as the dimension of input X.
+2. shape list: crop input X into the shape described by a list.
+               The size of shape list should be the same as
+               the dimension size of input X.
+
+The input should be a k-D tensor(k > 0 and k < 7). As an example:
+
+Case 1:
+Given
+
+    X = [[0, 1, 2, 0, 0]
+         [0, 3, 4, 0, 0]
+         [0, 0, 0, 0, 0]],
+
+and
+
+    offsets = [0, 1],
+
+and
+
+    shape = [2, 2],
+
+we get:
+
+    Out = [[1, 2],
+           [3, 4]].
+
+
+Case 2:
+Given
+
+    X = [[0, 1, 2, 5, 0]
+         [0, 3, 4, 6, 0]
+         [0, 0, 0, 0, 0]],
+
+and
+
+    offsets = [0, 1],
+
+and
+
+    Y = [[0, 0, 0]
+         [0, 0, 0]],
+
+we get:
+
+    Out = [[1, 2, 5],
+           [3, 4, 6]].
+)DOC");
+  }
+};
+
+// Shape inference for crop_grad: X@GRAD, when requested, has the shape of X.
+class CropOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f3610675aae1572380ca9b778ac3251c1951678b
--- /dev/null
+++ b/paddle/fluid/operators/crop_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+// EIGEN_USE_GPU must be defined before the header is included.
+#define EIGEN_USE_GPU
+#include "paddle/fluid/operators/crop_op.h"
+
+// CUDA registrations of the crop forward and backward kernels.
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c7c0446d4c0baf1ba59eb860a928341eed7cce0
--- /dev/null
+++ b/paddle/fluid/operators/crop_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {  // Internal
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::Tensor;
+
+// Forward kernel of crop: copies the window of X that starts at "offsets"
+// and has the shape of Out into Out.
+template <typename T>
+class CropKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    const T* x_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto x_stride = framework::stride(x->dims());
+    auto out_stride = framework::stride(out->dims());
+    auto offsets = context.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        x->dims().size(), static_cast<int64_t>(offsets.size()),
+        "Offsets size should be equal to dimension size of input tensor.");
+    // Linear index of the first cropped element inside X.
+    int64_t offset = 0;
+    for (size_t i = 0; i < offsets.size(); ++i) {
+      offset += (x_stride[i] * offsets[i]);
+    }
+    StridedMemcpy<T>(context.device_context(), x_data + offset, x_stride,
+                     out->dims(), out_stride, out_data);
+  }
+};
+
+// Backward of crop for rank-D tensors: X@GRAD is Out@GRAD zero-padded back
+// to the shape of X. Does nothing when X@GRAD is not requested.
+template <typename DeviceContext, typename T, size_t D>
+void CropGradFunction(const framework::ExecutionContext& context) {
+  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+  if (d_x != nullptr) {
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    d_x->mutable_data<T>(context.GetPlace());
+    auto offsets = context.Attr<std::vector<int>>("offsets");
+    // Same guard as the forward kernel; offsets[i] is indexed up to D below.
+    PADDLE_ENFORCE_EQ(
+        offsets.size(), D,
+        "Offsets size should be equal to dimension size of input tensor.");
+    Eigen::array<std::pair<int, int>, D> paddings;
+    for (size_t i = 0; i < D; ++i) {
+      // Leading padding is the crop offset; trailing padding is whatever is
+      // left of X after the offset and the cropped extent.
+      paddings[i].first = offsets[i];
+      paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
+    }
+    auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
+    auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
+    d_x_tensor.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        d_out_tensor.pad(paddings, 0);
+  }
+}
+
+// Backward kernel of crop: dispatches on the runtime rank of Out@GRAD to the
+// matching compile-time instantiation of CropGradFunction (ranks 1 to 6).
+template <typename DeviceContext, typename T>
+class CropGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    size_t rank =
+        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
+    switch (rank) {
+      case 1:
+        CropGradFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        CropGradFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        CropGradFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        CropGradFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        CropGradFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        CropGradFunction<DeviceContext, T, 6>(context);
+        break;
+      default:
+        PADDLE_THROW(
+            "CropOp only support tensors with no more than 6 dimensions.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e34b248b6aa696eaa03c7e1b4236a76a9081ef0
--- /dev/null
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/operators/cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Shape inference and kernel selection for the cross_entropy operator.
+class CrossEntropyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks X [N x D] against Label ([N x D] for soft labels, [N x 1]
+  // otherwise) and sets Y to [N x 1], sharing the LoD of X.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2,
+                      "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    if (ctx->Attrs().Get<bool>("soft_label")) {
+      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+                        "If Attr(soft_label) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(label_dims[1], 1,
+                        "If Attr(soft_label) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
+
+    ctx->SetOutputDim("Y", {x_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of cross_entropy
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+// Shape inference and kernel selection for cross_entropy_grad.
+class CrossEntropyGradientOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Repeats the forward checks, additionally requires Y@GRAD to be [N x 1],
+  // and gives X@GRAD the shape and LoD of X.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
+                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
+                      "The 2nd dimension of Input(Y@Grad) should be 1.");
+    if (ctx->Attrs().Get<bool>("soft_label")) {
+      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+                        "When Attr(soft_label) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(label_dims[1], 1,
+                        "When Attr(soft_label) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of cross_entropy
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+// Declares the inputs, output, attribute and documentation of cross_entropy.
+class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
+             " where N is the batch size and D is the number of classes. "
+             "This input is a probability computed by the previous operator, "
+             "which is almost always the result of a softmax operator.");
+    AddInput("Label",
+             "(Tensor), the ground truth which is a 2-D tensor. When "
+             "soft_label is set to false, Label is a Tensor<int64> with shape "
+             "[N x 1]. When soft_label is set to true, Label is a "
+             "Tensor<float/double> with shape [N x D].");
+    AddOutput("Y",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "[N x 1]. The cross entropy loss.");
+    AddAttr<bool>("soft_label",
+                  "(bool, default false), a flag indicating whether to "
+                  "interpret the given labels as soft labels.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+CrossEntropy Operator.
+
+It supports both standard cross-entropy and soft-label cross-entropy loss
+computation.
+1) One-hot cross-entropy:
+    soft_label = false, Label[i, 0] indicates the class index for sample i:
+
+                $Y[i] = -\log(X[i, Label[i]])$
+
+2) Soft-label cross-entropy:
+    soft_label = true, Label[i, j] indicates the soft label of class j
+    for sample i:
+
+                $Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}$
+
+   Please make sure that in this case the summation of each row of Label
+   equals one.
+
+3) One-hot cross-entropy with vectorized Input(Label):
+     As a special case of 2), when each row of Input(Label) has only one
+     non-zero element (equals 1), soft-label cross-entropy degenerates to a
+     one-hot cross-entropy with one-hot label representation.
+
+Both the input X and Label can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input X.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+            cross_entropy_grad, ops::CrossEntropyGradientOp);
+REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
+                       ops::CrossEntropyOpKernel<double>);
+REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
+                       ops::CrossEntropyGradientOpKernel<float>,
+                       ops::CrossEntropyGradientOpKernel<double>);
diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..de0976c69fc65ebc2ef7df9025d6071545e91c33
--- /dev/null
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/operators/cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {
+
+// Hard-label gradient: for every sample i only the entry of the labelled
+// class is written, dX[i, label[i]] = -dY[i] / X[i, label[i]]. The caller
+// zero-fills dX beforehand. label[i] is not range-checked here.
+template <typename T>
+__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                           const int64_t* label, const int N,
+                                           const int D) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    int idx = i * D + label[i];
+    dX[idx] = -dY[i] / X[idx];
+  }
+}
+
+// Soft-label gradient: one thread per element of the [N x D] matrix,
+// dX[i, j] = -label[i, j] * dY[i] / X[i, j].
+template <typename T>
+__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                               const T* label, const int N,
+                                               const int D) {
+  int ids = blockIdx.x * blockDim.x + threadIdx.x;
+  if (ids < N * D) {
+    int row_ids = ids / D;
+    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
+  }
+}
+}  // namespace
+
+// CUDA forward kernel of cross_entropy; delegates to CrossEntropyFunctor.
+template <typename T>
+class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+        ctx.template device_context<platform::CUDADeviceContext>(), y, x, label,
+        ctx.Attr<bool>("soft_label"));
+  }
+};
+
+// CUDA backward kernel of cross_entropy; launches one of the two gradient
+// kernels above depending on Attr(soft_label).
+template <typename T>
+class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    const T* dy_data =
+        ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    const T* x_data = x->data<T>();
+
+    int64_t batch_size = x->dims()[0];
+    int64_t class_num = x->dims()[1];
+
+    int block = 512;
+    // Soft labels need one thread per matrix element.
+    int grid = (batch_size * class_num + block - 1) / block;
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto stream = dev_ctx.stream();
+
+    if (ctx.Attr<bool>("soft_label")) {
+      auto* label_data = label->data<T>();
+      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
+    } else {
+      // Only the labelled column of every row gets a gradient, so clear the
+      // whole buffer first.
+      math::SetConstant<platform::CUDADeviceContext, T> functor;
+      functor(dev_ctx, dx, 0);
+      auto* label_data = label->data<int64_t>();
+      // Hard labels need one thread per sample only.
+      grid = (batch_size + block - 1) / block;
+      CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,
+                        ops::CrossEntropyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
+                        ops::CrossEntropyGradientOpCUDAKernel<float>,
+                        ops::CrossEntropyGradientOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a5b20ecb70887dd03865e53f168dad818195b16
--- /dev/null
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+// CPU forward kernel of cross_entropy; delegates to CrossEntropyFunctor.
+template <typename T>
+class CrossEntropyOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
+        ctx.template device_context<platform::CPUDeviceContext>(), y, x, labels,
+        ctx.Attr<bool>("soft_label"));
+  }
+};
+
+// CPU backward kernel of cross_entropy.
+//   soft labels: dX[i, j] = -Label[i, j] * dY[i] / X[i, j]
+//   hard labels: dX[i, Label[i]] = -dY[i] / X[i, Label[i]], zero elsewhere
+template <typename T>
+class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+
+    int64_t class_num = x->dims()[1];
+    if (ctx.Attr<bool>("soft_label")) {
+      auto x_mat = EigenMatrix<T>::From(*x);
+      auto dy_mat = EigenMatrix<T>::From(*dy);
+      auto lbl_mat = EigenMatrix<T>::From(*label);
+      auto dx_mat = EigenMatrix<T>::From(*dx);
+
+      // dY is [N x 1]; broadcast it along the class axis to [N x D].
+      dx_mat.device(*ctx.template device_context<platform::CPUDeviceContext>()
+                         .eigen_device()) =
+          -(lbl_mat *
+            dy_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) / x_mat);
+    } else {
+      int64_t batch_size = x->dims()[0];
+      const T* dy_data = dy->data<T>();
+      const T* x_data = x->data<T>();
+      const int64_t* label_data = label->data<int64_t>();
+
+      math::SetConstant<platform::CPUDeviceContext, T> functor;
+      functor(ctx.template device_context<platform::CPUDeviceContext>(), dx, 0);
+
+      for (int64_t i = 0; i < batch_size; ++i) {
+        // Both bounds must hold; with "||" the check could never fail.
+        PADDLE_ASSERT(label_data[i] >= 0 && label_data[i] < class_num);
+        int64_t index = i * class_num + label_data[i];
+        dx_data[index] = -dy_data[i] / x_data[index];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c7db78813e3bdd90d1c65e3af26208ae2a9ba21
--- /dev/null
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/ctc_align_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Shape inference and kernel selection for the ctc_align operator.
+class CTCAlignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input of CTCAlignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output of CTCAlignOp should not be null.");
+
+    auto input_dims = ctx->GetInputDim("Input");
+
+    // TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
+    // The real output length is only known after the kernel has run; the
+    // kernels resize Output themselves.
+    ctx->SetOutputDim("Output", input_dims);
+  }
+
+ protected:
+  // The kernel data type follows Input(Input).
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+// Declares the input, output, attributes and documentation of ctc_align.
+class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor, default: LoDTensor<int>), Its shape is "
+             "[Lp, 1], where Lp is the sum of all input sequences' length.");
+    AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
+    AddAttr<int>("blank",
+                 "(int, default: 0), the blank label set in Connectionist "
+                 "Temporal Classification (CTC) op.")
+        .SetDefault(0);
+    AddAttr<bool>("merge_repeated",
+                  "(bool, default: true), whether to "
+                  "merge repeated elements between two blanks. ")
+        .SetDefault(true);
+    AddComment(R"DOC(
+CTCAlign op is used to merge repeated elements between two blanks
+and then delete all blanks in sequence.
+
+Given:
+    Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6,
+                  6, 0, 0, 7, 7, 7, 0]
+    Input.dims = {18, 1}
+    Input.LoD = [[0, 11, 18]]
+
+And:
+    blank = 0
+    merge_repeated = True
+
+Then:
+    Output.data = [1, 2, 4, 4, 5, 6,
+                   6, 7]
+    Output.dims = {8, 1}
+    Output.LoD = [[0, 6, 8]]
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    ctc_align, ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f629e0a9f15192c4d0d6fa5b8a122811d11ca415
--- /dev/null
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors.
All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <vector>
+#include "paddle/fluid/operators/ctc_align_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Sequentially walks all sequences (delimited by the absolute offsets in
+// lod0), drops blank tokens and, when merge_repeated is non-zero, tokens equal
+// to their predecessor. Survivors are packed into output and the resulting
+// sequence offsets are written to out_lod0. The body contains no thread
+// indexing, so it is meant to be launched with a single thread.
+template <typename T>
+__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
+                                      const size_t num_seq, size_t* lod0,
+                                      const int blank, const int merge_repeated,
+                                      size_t* out_lod0, T* output) {
+  size_t output_idx = 0;
+  out_lod0[0] = 0;
+
+  for (size_t i = 0; i < num_seq; ++i) {
+    // Reset at every sequence start so that merging never crosses sequences.
+    T pre_token = -1;
+    for (size_t j = lod0[i]; j < lod0[i + 1]; ++j) {
+      if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
+        output[output_idx] = tokens[j];
+        ++output_idx;
+      }
+      pre_token = tokens[j];
+    }
+    out_lod0[i + 1] = output_idx;
+  }
+}
+
+// CUDA kernel of the ctc_align operator.
+template <typename T>
+class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    const size_t level = 0;
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    auto input_lod = framework::ToAbsOffset(input->lod());
+
+    const T* tokens = input->data<T>();
+    const int64_t num_tokens = input->dims()[0];
+    const size_t num_seq = input_lod[level].size() - 1;
+
+    const int blank = ctx.Attr<int>("blank");
+    const int merge_repeated =
+        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
+
+    // prepare a lod to record lod information while merging elements
+    thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
+    size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
+
+    // merge elements and delete blank
+    // Allocate for the worst case (nothing removed); shrunk below.
+    T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
+
+    auto stream = ctx.cuda_device_context().stream();
+    MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
+        num_tokens, tokens, num_seq,
+        input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated,
+        dev_out_lod0_ptr, output_data);
+
+    // set output lod
+    std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
+    framework::LoD out_lod;
+    out_lod.push_back(host_out_lod0);
+    output->set_lod(out_lod);
+
+    // resize output dims
+    output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+
+    // Everything was removed: emit a single -1 as placeholder.
+    if (host_out_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, T> set_constant;
+      set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
+                   output, -1);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel<int>,
+                        paddle::operators::CTCAlignOpCUDAKernel<int64_t>);
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ef034c2f5b566e3cf720e295953fd7a69dd5812
--- /dev/null
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class CTCAlignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* output = ctx.Output("Output"); + const size_t level = 0; + auto input_lod = framework::ToAbsOffset(input->lod()); + + // check input dims and lod + auto input_dims = input->dims(); + PADDLE_ENFORCE_EQ(input_dims[0], + static_cast(input_lod[level].back()), + "The first dimension of Input(Input) should be equal to " + "the sum of all sequences' lengths."); + + const size_t num_sequences = input_lod[level].size() - 1; + size_t blank = static_cast(ctx.Attr("blank")); + bool merge_repeated = ctx.Attr("merge_repeated"); + + // merge repeated tokens and delete blank + T* output_data = output->mutable_data(ctx.GetPlace()); + size_t output_idx = 0; + std::vector output_lod0(1, 0); + const T* input_data = input->data(); + for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { + T prev_token = -1; + for (size_t i = input_lod[level][seq_idx]; + i < input_lod[level][seq_idx + 1]; ++i) { + if ((unsigned)input_data[i] != blank && + !(merge_repeated && input_data[i] == prev_token)) { + output_data[output_idx] = input_data[i]; + ++output_idx; + } + prev_token = input_data[i]; + } + output_lod0.push_back(output_idx); + } + + // set output lod + framework::LoD output_lod; + output_lod.push_back(output_lod0); + output->set_lod(output_lod); + // resize output dims + output->Resize({static_cast(output_lod0.back()), 1}); + // for empty sequence + if (output_lod0.back() == 0) { + output->Resize({1, 1}); + output_data = output->mutable_data(ctx.GetPlace()); + output_data[0] = -1; + } + } +}; + +} // namespace operators +} // namespace 
paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3b2249147848b790833d09a0abe0370057ddd617 --- /dev/null +++ b/paddle/fluid/operators/cum_op.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class CumKernel : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + int axis = context.Attr("axis"); + bool exclusive = context.Attr("exclusive"); + bool reverse = context.Attr("reverse"); + auto x_dims = X.dims(); + if (axis == -1) { + axis = x_dims.size() - 1; + } + PADDLE_ENFORCE_LT( + axis, x_dims.size(), + "axis should be less than the dimensiotn of the input tensor"); + Out.mutable_data(context.GetPlace()); + + int pre = 1; + int post = 1; + int mid = x_dims[axis]; + for (int i = 0; 
i < axis; ++i) { + pre *= x_dims[i]; + } + for (int i = axis + 1; i < x_dims.size(); ++i) { + post *= x_dims[i]; + } + + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(mid), x, out, + /* axis= */ 0, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(mid, post), x, out, + /* axis= */ 0, reverse, exclusive); + } + } else { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, + /* axis= */ 1, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, + /* axis= */ 1, reverse, exclusive); + } + } + } + + private: + template + void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, + bool reverse, bool exclusive) const { + if (!reverse) { + out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = + Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); + } + } +}; + +template +struct CumsumFunctor { + using ELEMENT_TYPE = T; + template + const typename X::TensorScanSumOp operator()(X x, int axis, + bool exclusive) const { + return x.cumsum(axis, exclusive); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d15d4e3db35c4cd27f7b990a39a40af57acd5a65 --- /dev/null +++ b/paddle/fluid/operators/cumsum_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cum_op.h" + +namespace paddle { +namespace operators { + +class CumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Cumsum operator"); + AddOutput("Out", "Output of Cumsum operator"); + AddAttr("axis", + "(int, default -1). The dimenstion to accumulate along. " + "-1 means the last dimenstion") + .SetDefault(-1) + .EqualGreaterThan(-1); + AddAttr("exclusive", + "bool, default false). Whether to perform exclusive cumsum") + .SetDefault(false); + AddAttr("reverse", + "bool, default false). If true, the cumsum is performed in " + "the reversed direction") + .SetDefault(false); + AddComment(R"DOC( +The cumulative sum of the elements along a given axis. +By default, the first element of the result is the same of the first element of +the input. If exlusive is true, the first element of the result is 0. 
+)DOC"); + } +}; + +class CumsumGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("cumsum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", Attr("axis")); + grad_op->SetAttr("reverse", !Attr("reverse")); + grad_op->SetAttr("exclusive", Attr("exclusive")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker); +REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e063cc0f65a5d63f8f558c5f16e548a1e1fcd4f6 --- /dev/null +++ b/paddle/fluid/operators/cumsum_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cum_op.h" + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d827155919ed060df6bb45bbb54c286e81cb6c81 --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/decayed_adagrad_op.h" + +namespace paddle { +namespace operators { + +class DecayedAdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of DecayedAdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of DecayedAdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of DecayedAdagradOp should have " + "the same dimension."); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of DecayedAdagradOp should have " + "the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + 
AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("decay", + "(float, default 0.95) " + "Discounting factor for coming gradient") + .SetDefault(0.95); + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( +Decayed Adagrad Optimizer. + +The update is done as follows: + +$$ +moment\_out = decay * moment + (1 - decay) * grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon} +$$ + +The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) +does not have an epsilon attribute. It is added here for numerical +stability to avoid the division by zero error. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, + ops::DecayedAdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/decayed_adagrad_op.cu b/paddle/fluid/operators/decayed_adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..215d6dbc7d80405bf2fdd340c280c299de9e8cc7 --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/decayed_adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/decayed_adagrad_op.h b/paddle/fluid/operators/decayed_adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..52b67586ea3d138f738956c450416698042590fa --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DecayedAdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + float decay = ctx.Attr("decay"); + float epsilon = ctx.Attr("epsilon"); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto& place = *ctx.template device_context().eigen_device(); + + moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt similarity index 100% rename from paddle/operators/detail/CMakeLists.txt rename to paddle/fluid/operators/detail/CMakeLists.txt diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d395d347ba4f48bae6b879c1daa3adb6f838e77 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "grpc_client.h" +#include "paddle/fluid/framework/threadpool.h" +namespace paddle { +namespace operators { +namespace detail { + +bool RPCClient::AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage req; + SerializeToMessage(var_name_val, var, *p_ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + + // stub context + SendProcessor* s = new SendProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = NULL; + + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + }); + + req_count_++; + + return true; +} + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& ret_msg) { + auto* outvar = var_h.scope->FindVar(var_h.name); + DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); +} + +bool RPCClient::AsyncGetVariable(const std::string& ep, + const 
platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + + // stub context + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + }); + + req_count_++; + + return true; +} + +bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); + + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(BATCH_BARRIER_MESSAGE); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + req_count_++; + + return true; +} + +bool RPCClient::Wait() { + if (req_count_ <= 0) { + return true; + } + const size_t kReqCnt = req_count_; + bool a[kReqCnt]; + std::vector> waits(req_count_); + + for (int i = 0; i < req_count_; i++) { + waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + } + + for (int i = 0; i < req_count_; i++) { + waits[i].wait(); + } + + int last_req_count = req_count_; + req_count_ = 0; + + for (int i = 0; i < last_req_count; i++) { + if (!a[i]) { + return false; + } + } + + return true; +} + +bool RPCClient::Proceed() { + void* tag = NULL; + bool ok = false; + + // request counts. 
+ if (!cq_.Next(&tag, &ok)) { + LOG(ERROR) << "Get meets CompletionQueue error"; + return false; + } + + GPR_ASSERT(ok); + PADDLE_ENFORCE(tag); + + // TODO(gongwb): add more retries. + ClientBase* c = static_cast(tag); + if (!c->status_.ok()) { + LOG(ERROR) << "proc param error:" << c->var_h_.String() + << " grpc error:" << c->status_.error_message(); + delete c; + return false; + } + + c->Process(); + delete c; + return true; +} + +std::shared_ptr RPCClient::GetChannel(const std::string& ep) { + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + + grpc::ChannelArguments args; + args.SetMaxSendMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + + auto ch = std::shared_ptr( + grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args)); + + channels_[ep] = ch; + return ch; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..314fe8168f0ecdae7b5d2737279050f54185e02a --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -0,0 +1,171 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { +namespace detail { + +struct VarHandle { + std::string ep; + const platform::DeviceContext* ctx; + const framework::Scope* scope; + std::string name; + + std::string String() const { + std::ostringstream s; + s << "name:[" << name << "] ep:[" << ep << "]"; + return s.str(); + } +}; + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& msg); + +class ClientBase { + public: + explicit ClientBase(std::shared_ptr ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + context_ = NULL; + } + + virtual ~ClientBase() {} + + virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + context_.reset(new grpc::ClientContext()); + var_h_ = var_info; + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Prepare(int64_t time_out) { + context_.reset(new grpc::ClientContext()); + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Process() = 0; + + std::unique_ptr stub_; + std::unique_ptr context_; + grpc::Status status_; + VarHandle var_h_; +}; + +typedef std::function + RequestSendCallBack; + +class SendProcessor : public ClientBase { + public: + explicit SendProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~SendProcessor() {} + + virtual void Process() { + if (response_call_back_) { + 
response_call_back_(var_h_, reply_); + } + } + + sendrecv::VoidMessage reply_; + RequestSendCallBack response_call_back_ = NULL; +}; + +typedef std::function + RequestGetCallBack; + +class GetProcessor : public ClientBase { + public: + explicit GetProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~GetProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VariableMessage reply_; + RequestGetCallBack response_call_back_ = ProcGetResponse; +}; + +class BatchBarrierProcessor : public ClientBase { + public: + explicit BatchBarrierProcessor(std::shared_ptr ch) + : ClientBase(ch) {} + + virtual ~BatchBarrierProcessor() {} + + virtual void Process() {} + sendrecv::VoidMessage reply_; +}; + +class RPCClient { + public: + bool AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + + bool AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + + bool AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = 600 * 1000); + + bool Wait(); + + private: + bool Proceed(); + std::shared_ptr GetChannel(const std::string& ep); + + private: + grpc::CompletionQueue cq_; + std::map> channels_; + int64_t req_count_ = 0; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..96f4ea797b1d8e82fa1c2a52a8b353259906bac2 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detail/grpc_server.h" + +using grpc::ServerAsyncResponseWriter; + +namespace paddle { +namespace operators { +namespace detail { + +enum CallStatus { PROCESS = 0, FINISH }; + +// reference: +// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server +class RequestBase { + public: + explicit RequestBase(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq) + : service_(service), cq_(cq), status_(PROCESS) { + PADDLE_ENFORCE(cq_); + } + virtual ~RequestBase() {} + virtual void Process() { assert(false); } + + CallStatus Status() { return status_; } + void SetStatus(CallStatus status) { status_ = status; } + virtual std::string GetReqName() { + assert(false); + return ""; + } + + protected: + grpc::ServerContext ctx_; + sendrecv::SendRecvService::AsyncService* service_; + grpc::ServerCompletionQueue* cq_; + CallStatus status_; +}; + +typedef std::pair MessageWithName; + +class RequestSend final : public RequestBase { + public: + explicit RequestSend(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, + SimpleBlockQueue* queue) + : RequestBase(service, cq), queue_(queue), responder_(&ctx_) { + service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_, + this); + } + + virtual ~RequestSend() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + 
MessageWithName msg_with_name = + std::make_pair(request_.varname(), std::move(request_)); + queue_->Push(std::move(msg_with_name)); + responder_.Finish(reply_, grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + SimpleBlockQueue* queue_; + ServerAsyncResponseWriter responder_; +}; + +class RequestGet final : public RequestBase { + public: + explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + SimpleBlockQueue* queue) + : RequestBase(service, cq), + responder_(&ctx_), + scope_(scope), + dev_ctx_(dev_ctx), + queue_(queue) { + service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this); + } + + virtual ~RequestGet() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + // proc request. + std::string var_name = request_.varname(); + auto* var = scope_->FindVar(var_name); + SerializeToMessage(var_name, var, *dev_ctx_, &reply_); + // TODO(gongwb): check var's info. 
+ responder_.Finish(reply_, grpc::Status::OK, this); + status_ = FINISH; + queue_->Push('c'); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VariableMessage reply_; + ServerAsyncResponseWriter responder_; + framework::Scope* scope_; + const platform::DeviceContext* dev_ctx_; + SimpleBlockQueue* queue_; +}; + +void AsyncGRPCServer::WaitClientGet(int count) { + for (int i = 0; i < count; ++i) { + var_get_queue_.Pop(); + } +} + +void AsyncGRPCServer::RunSyncUpdate() { + grpc::ServerBuilder builder; + builder.AddListeningPort(address_, grpc::InsecureServerCredentials()); + builder.SetMaxSendMessageSize(std::numeric_limits::max()); + builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + builder.RegisterService(&service_); + + cq_send_ = builder.AddCompletionQueue(); + cq_get_ = builder.AddCompletionQueue(); + + server_ = builder.BuildAndStart(); + LOG(INFO) << "Server listening on " << address_ << std::endl; + + std::function send_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); + std::function get_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + + t_send_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, + cq_send_.get(), "cq_send", send_register))); + + t_get_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, + cq_get_.get(), "cq_get", get_register))); + + // wait server + server_->Wait(); + t_send_->join(); + t_get_->join(); +} + +void AsyncGRPCServer::ShutdownQueue() { + std::unique_lock lock(cq_mutex_); + cq_send_->Shutdown(); + cq_get_->Shutdown(); + is_shut_down_ = true; +} + +// This URL explains why shutdown is complicate: +void AsyncGRPCServer::ShutDown() { + server_->Shutdown(); + ShutdownQueue(); +} + +void AsyncGRPCServer::TryToRegisterNewSendOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestSend* send = + new RequestSend(&service_, cq_send_.get(), &var_recv_queue_); + VLOG(4) << "Create 
RequestSend status:" << send->Status(); +} + +void AsyncGRPCServer::TryToRegisterNewGetOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_, + &var_get_queue_); + VLOG(4) << "Create RequestGet status:" << get->Status(); +} + +// FIXME(typhoonzero): change cq_name to enum. +void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq, + std::string cq_name, + std::function TryToRegisterNewOne) { + TryToRegisterNewOne(); + + void* tag = NULL; + bool ok = false; + while (true) { + if (!cq->Next(&tag, &ok)) { + LOG(INFO) << cq_name << " get CompletionQueue shutdown!"; + break; + } + + PADDLE_ENFORCE(tag); + // FIXME(typhoonzero): de-couple the barriers with recv_op + if (cq_name == "cq_get") WaitCond(1); + if (cq_name == "cq_send") WaitCond(0); + + RequestBase* base = (RequestBase*)tag; + // reference: + // https://github.com/tensorflow/tensorflow/issues/5596 + // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM + // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I + if (!ok) { + LOG(WARNING) << cq_name << " recv no regular event:argument name" + << base->GetReqName(); + TryToRegisterNewOne(); + delete base; + continue; + } + + switch (base->Status()) { + case PROCESS: { + VLOG(4) << cq_name << " status:" << base->Status(); + TryToRegisterNewOne(); + base->Process(); + break; + } + case FINISH: { + VLOG(4) << cq_name << " status:" << base->Status(); + delete base; + break; + } + default: { assert(false); } + } + } +} + +void AsyncGRPCServer::WaitCond(int cond) { + std::unique_lock lock(this->barrier_mutex_); + barrier_condition_.wait(lock, + [=] { return this->barrier_cond_step_ == cond; }); +} + +void AsyncGRPCServer::SetCond(int cond) { + { + std::lock_guard lock(this->barrier_mutex_); + barrier_cond_step_ = cond; + } + barrier_condition_.notify_all(); +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff 
--git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..1382d1731838c72008ef782d1c398f534f23f7e6 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" + +#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +#include +#include +#include +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +typedef std::pair MessageWithName; +class RequestBase; + +class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { + public: + explicit AsyncGRPCServer(const std::string &address) : address_(address) {} + + void RunSyncUpdate(); + + // functions to sync server barrier status. 
+ void WaitCond(int cond); + void SetCond(int cond); + void WaitClientGet(int count); + + void SetScope(framework::Scope *scope) { scope_ = scope; } + + void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } + + const MessageWithName Get() { return this->var_recv_queue_.Pop(); } + + void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } + + void ShutDown(); + + protected: + void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name, + std::function TryToRegisterNewOne); + void TryToRegisterNewSendOne(); + void TryToRegisterNewGetOne(); + void ShutdownQueue(); + + private: + std::mutex cq_mutex_; + volatile bool is_shut_down_ = false; + std::unique_ptr cq_send_; + std::unique_ptr cq_get_; + + sendrecv::SendRecvService::AsyncService service_; + std::unique_ptr server_; + + std::string address_; + framework::Scope *scope_; + const platform::DeviceContext *dev_ctx_; + // received variable from RPC, operators fetch variable from this queue. 
+ SimpleBlockQueue var_recv_queue_; + SimpleBlockQueue var_get_queue_; + + // condition of the sub program + std::mutex barrier_mutex_; + mutable int barrier_cond_step_; + std::condition_variable barrier_condition_; + + std::unique_ptr t_send_; + std::unique_ptr t_get_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h similarity index 100% rename from paddle/operators/detail/safe_ref.h rename to paddle/fluid/operators/detail/safe_ref.h diff --git a/paddle/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto similarity index 100% rename from paddle/operators/detail/send_recv.proto rename to paddle/fluid/operators/detail/send_recv.proto diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba3ae6add6099cb232a5e0df82550b9c2628c05c --- /dev/null +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg) { + msg->set_varname(name); + std::ostringstream oss; + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarDesc_VarType_LOD_TENSOR: + msg->set_type(sendrecv::VarType::LOD_TENSOR); + framework::SerializeToStream(oss, var->Get(), ctx); + break; + case framework::proto::VarDesc_VarType_SELECTED_ROWS: + msg->set_type(sendrecv::VarType::SELECTED_ROWS); + framework::SerializeToStream(oss, var->Get(), + ctx); + break; + default: { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } + msg->set_serialized(oss.str()); +} + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var) { + std::istringstream iss(msg.serialized()); + switch (msg.type()) { + case sendrecv::VarType::LOD_TENSOR: + DeserializeFromStream(iss, var->GetMutable(), ctx); + break; + case sendrecv::VarType::SELECTED_ROWS: { + DeserializeFromStream(iss, var->GetMutable(), + ctx); + break; + } + default: { + PADDLE_THROW("Deserialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fed887c02796463b4b1b7a747883c702c2a95a72 --- /dev/null +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace detail { + +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg); + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var); +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/fluid/operators/detail/simple_block_queue.h similarity index 100% rename from paddle/operators/detail/simple_block_queue.h rename to paddle/fluid/operators/detail/simple_block_queue.h diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..d7a7eed50b961b0efc04a2a636178fa6578cbf3a --- /dev/null +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace detail { + +template +struct StridedMemcpyFunctor; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + framework::Dim<1> src_stride, framework::Dim<1> dst_dim, + framework::Dim<1> dst_stride, T* dst) const { + auto place = dev_ctx.GetPlace(); + if (platform::is_cpu_place(place)) { + auto& cpu_place = boost::get(place); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); + } else { +#ifdef PADDLE_WITH_CUDA + auto& gpu_place = boost::get(place); + auto& cuda_ctx = + reinterpret_cast(dev_ctx); + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, + cuda_ctx.stream()); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } +}; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + framework::Dim src_stride, framework::Dim dst_dim, + framework::Dim dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim.head; ++i) { + StridedMemcpyFunctor func; + func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); + src += src_stride.head; + dst += dst_stride.head; + } + } +}; + +template +struct StridedCopyDimVisitor : public boost::static_visitor 
{ + StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, + const framework::DDim& src_stride, + const framework::DDim& dst_stride, T* dst) + : dev_ctx_(dev_ctx), + src_(src), + src_stride_(src_stride), + dst_stride_(dst_stride), + dst_(dst) {} + + template + void operator()(Dim dst_dim) const { + Dim src_stride = boost::get(src_stride_); + Dim dst_stride = boost::get(dst_stride_); + constexpr int dim = Dim::dimensions; + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); + } + + const platform::DeviceContext& dev_ctx_; + const T* src_; + const framework::DDim& src_stride_; + const framework::DDim& dst_stride_; + T* dst_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection_output_op.cc b/paddle/fluid/operators/detection_output_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6dee5222959f00141cf5c09257d4a2c96b9e3746 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection_output_op.h" +namespace paddle { +namespace operators { + +class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Loc", + "(Tensor) The input tensor of detection_output operator." + "The input predict locations" + "The format of input tensor is kNCHW. Where K is priorbox point " + "numbers," + "N is How many boxes are there on each point, " + "C is 4, H and W both are 1."); + AddInput("Conf", + "(Tensor) The input tensor of detection_output operator." + "The input priorbox confidence." + "The format of input tensor is kNCHW. Where K is priorbox point " + "numbers," + "N is How many boxes are there on each point, " + "C is the number of classes, H and W both are 1."); + AddInput("PriorBox", + "(Tensor) The input tensor of detection_output operator." + "The format of input tensor is the position and variance " + "of the boxes"); + AddOutput("Out", + "(Tensor) The output tensor of detection_output operator."); + AddAttr("background_label_id", "(int), The background class index."); + AddAttr("num_classes", "(int), The number of the classification."); + AddAttr("nms_threshold", + "(float), The Non-maximum suppression threshold."); + AddAttr("confidence_threshold", + "(float), The classification confidence threshold."); + AddAttr("top_k", "(int), The bbox number kept of the layer’s output."); + AddAttr("nms_top_k", + "(int), The bbox number kept of the NMS’s output."); + AddComment(R"DOC( + detection output for SSD(single shot multibox detector) + Apply the NMS to the output of network and compute the predict + bounding box location. The output’s shape of this layer could + be zero if there is no valid bounding box. 
+ )DOC"); + } +}; + +class DetectionOutputOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Loc"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Conf"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of DetectionOutputOp should not be null."); + std::vector output_shape({1, 7}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp, + ops::DetectionOutputOpMaker); +REGISTER_OP_CPU_KERNEL( + detection_output, + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/fluid/operators/detection_output_op.cu.cc b/paddle/fluid/operators/detection_output_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..309e03a25be95362b11689f873bafe68570c42e4 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection_output_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + detection_output, + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/fluid/operators/detection_output_op.h b/paddle/fluid/operators/detection_output_op.h new file mode 100644 index 0000000000000000000000000000000000000000..05e5b72bd354329d575c33a88189cfbc64abfea9 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.h @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/detection_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/strided_memcpy.h" +namespace paddle { +namespace operators { +template +inline void transpose_fun(const framework::ExecutionContext& context, + const framework::Tensor& src, + framework::Tensor* dst) { + int input_nums = src.dims()[0]; + int offset = 0; + for (int j = 0; j < input_nums; ++j) { + framework::Tensor in_p_tensor = src.Slice(j, j + 1); + std::vector shape_vec( + {in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3], + in_p_tensor.dims()[4], in_p_tensor.dims()[2]}); + framework::DDim shape(framework::make_ddim(shape_vec)); + framework::Tensor in_p_tensor_transpose; + in_p_tensor_transpose.mutable_data(shape, context.GetPlace()); + std::vector shape_axis({0, 1, 3, 4, 2}); + math::Transpose trans5; + trans5(context.template device_context(), in_p_tensor, + &in_p_tensor_transpose, shape_axis); + auto dst_stride = framework::stride(dst->dims()); + auto src_stride = framework::stride(in_p_tensor_transpose.dims()); + StridedMemcpy(context.device_context(), in_p_tensor_transpose.data(), + src_stride, in_p_tensor_transpose.dims(), dst_stride, + dst->data() + offset); + offset += in_p_tensor_transpose.dims()[4] * src_stride[4]; + } +} +template +class DetectionOutputKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_loc = context.Input("Loc"); + const framework::Tensor* in_conf = context.Input("Conf"); + const framework::Tensor* in_priorbox = + context.Input("PriorBox"); + auto* out = context.Output("Out"); + int num_classes = context.template Attr("num_classes"); + int top_k = context.template Attr("top_k"); + int nms_top_k = context.template 
Attr("nms_top_k"); + int background_label_id = context.template Attr("background_label_id"); + float nms_threshold = context.template Attr("nms_threshold"); + float confidence_threshold = + context.template Attr("confidence_threshold"); + size_t batch_size = in_conf->dims()[1]; + int conf_sum_size = in_conf->numel(); + // for softmax + std::vector conf_shape_softmax_vec( + {conf_sum_size / num_classes, num_classes}); + framework::DDim conf_shape_softmax( + framework::make_ddim(conf_shape_softmax_vec)); + // for knchw => nhwc + std::vector loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3], + in_loc->dims()[4], + in_loc->dims()[2] * in_loc->dims()[0]}); + std::vector conf_shape_vec( + {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4], + in_conf->dims()[2] * in_conf->dims()[0]}); + framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); + framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); + framework::Tensor loc_tensor; + framework::Tensor conf_tensor; + loc_tensor.mutable_data(loc_shape, context.GetPlace()); + conf_tensor.mutable_data(conf_shape, context.GetPlace()); + // for cpu + framework::Tensor loc_cpu; + framework::Tensor conf_cpu; + framework::Tensor priorbox_cpu; + const T* priorbox_data = in_priorbox->data(); + transpose_fun(context, *in_loc, &loc_tensor); + transpose_fun(context, *in_conf, &conf_tensor); + conf_tensor.Resize(conf_shape_softmax); + math::SoftmaxFunctor()( + context.template device_context(), &conf_tensor, + &conf_tensor); + T* loc_data = loc_tensor.data(); + T* conf_data = conf_tensor.data(); + if (platform::is_gpu_place(context.GetPlace())) { + loc_cpu.mutable_data(loc_tensor.dims(), platform::CPUPlace()); + framework::Copy(loc_tensor, platform::CPUPlace(), + context.device_context(), &loc_cpu); + loc_data = loc_cpu.data(); + conf_cpu.mutable_data(conf_tensor.dims(), platform::CPUPlace()); + framework::Copy(conf_tensor, platform::CPUPlace(), + context.device_context(), &conf_cpu); + conf_data = 
conf_cpu.data(); + priorbox_cpu.mutable_data(in_priorbox->dims(), platform::CPUPlace()); + framework::Copy(*in_priorbox, platform::CPUPlace(), + context.device_context(), &priorbox_cpu); + priorbox_data = priorbox_cpu.data(); + } + // get decode bboxes + size_t num_priors = in_priorbox->numel() / 8; + std::vector>> all_decoded_bboxes; + for (size_t n = 0; n < batch_size; ++n) { + std::vector> decoded_bboxes; + for (size_t i = 0; i < num_priors; ++i) { + size_t prior_offset = i * 8; + size_t loc_pred_offset = n * num_priors * 4 + i * 4; + std::vector> prior_bbox_vec; + math::GetBBoxFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_vec); + std::vector> prior_bbox_var; + math::GetBBoxVarFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_var); + std::vector loc_pred_data; + for (size_t j = 0; j < 4; ++j) + loc_pred_data.push_back(*(loc_data + loc_pred_offset + j)); + math::BBox bbox = math::DecodeBBoxWithVar( + prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); + decoded_bboxes.push_back(bbox); + } + all_decoded_bboxes.push_back(decoded_bboxes); + } + std::vector>> all_indices; + int num_kept = math::GetDetectionIndices( + conf_data, num_priors, num_classes, background_label_id, batch_size, + confidence_threshold, nms_top_k, nms_threshold, top_k, + all_decoded_bboxes, &all_indices); + + if (num_kept <= 0) { + std::vector out_shape_vec({0, 0}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->Resize(out_shape); + return; + } + std::vector out_shape_vec({num_kept, 7}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->mutable_data(out_shape, context.GetPlace()); + framework::Tensor out_cpu; + T* out_data = out->data(); + if (platform::is_gpu_place(context.GetPlace())) { + out_cpu.mutable_data(out->dims(), platform::CPUPlace()); + out_data = out_cpu.data(); + } + math::GetDetectionOutput(conf_data, num_kept, num_priors, num_classes, + batch_size, all_indices, all_decoded_bboxes, + out_data); + if 
(platform::is_gpu_place(context.GetPlace())) { + framework::Copy(out_cpu, platform::CUDAPlace(), context.device_context(), + out); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1dc900512cb2c20cc1a39b5d11a78f5eb905dc5 --- /dev/null +++ b/paddle/fluid/operators/dropout_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/dropout_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class DropoutOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + if (ctx->Attrs().Get("is_test") == false) { + ctx->SetOutputDim("Mask", x_dims); + } + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of dropout op."); + AddOutput("Out", "The output of dropout op."); + AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate(); + + AddAttr("dropout_prob", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float& drop_p) { + PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f, + "'dropout_prob' must be between 0.0 and 1.0."); + }); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. NOTE: DO NOT set this flag to true in " + "training. Setting this flag to true is only useful in " + "unittest or for debug that always the same output units " + "will be dropped.") + .SetDefault(false); + AddAttr("seed", "Dropout random seed.").SetDefault(0); + + AddComment(R"DOC( +Dropout Operator. + +Dropout refers to randomly dropping out units in a nerual network. It is a +regularization technique for reducing overfitting by preventing neuron +co-adaption during training. 
The dropout operator randomly set (according to +the given dropout probability) the outputs of some units to zero, while others +are set equal to their corresponding inputs. + +)DOC"); + } +}; + +template +class DropoutOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + "GradOp is only callable when is_test is false"); + + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(x_dims, out_dims, + "Dimensions of Input(X) and Out@Grad must be the same."); + auto mask_dims = ctx->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ(x_dims, mask_dims, + "Dimensions of Input(X) and Mask must be the same."); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, + ops::DropoutOpGrad); +REGISTER_OP_CPU_KERNEL( + dropout, + ops::CPUDropoutKernel); +REGISTER_OP_CPU_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4ae9f4ce54d27dd1ad0312b5ad8d78a4cb904c79 --- /dev/null +++ b/paddle/fluid/operators/dropout_op.cu @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include +#include +#include +#include +#include "paddle/fluid/operators/dropout_op.h" + +namespace paddle { +namespace operators { + +template +struct MaskGenerator { + AttrType dropout_prob; + int seed; + + __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed) + : dropout_prob(dropout_prob), seed(seed) {} + + inline __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(0, 1); + rng.discard(n); + if (dist(rng) < dropout_prob) { + return static_cast(0); + } + return static_cast(1); + } +}; + +// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. +template +class GPUDropoutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + y->mutable_data(context.GetPlace()); + AttrType dropout_prob = context.Attr("dropout_prob"); + + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + + auto& place = *context.template device_context().eigen_device(); + if (!context.Attr("is_test")) { + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + int size = framework::product(mask->dims()); + + std::random_device rnd; + int seed = + context.Attr("fix_seed") ? 
context.Attr("seed") : rnd(); + + thrust::counting_iterator index_sequence_begin(0); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(mask_data), + MaskGenerator(dropout_prob, seed)); + auto M = EigenMatrix::Reshape(*mask, 1); + Y.device(place) = X * M; + } else { + Y.device(place) = X * (1.0f - dropout_prob); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + dropout, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9dd1f33669ccf89202abe4a80bd9796411f630ba --- /dev/null +++ b/paddle/fluid/operators/dropout_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CPUDropoutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + const auto* x_data = x->data(); + auto* y_data = y->mutable_data(context.GetPlace()); + float dropout_prob = context.Attr("dropout_prob"); + + if (!context.Attr("is_test")) { + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + + // NOTE: fixed seed should only be used in unittest or for debug. + // Guarantee to use random seed in training. + std::random_device rnd; + std::minstd_rand engine; + int seed = + context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + engine.seed(seed); + + std::uniform_real_distribution dist(0, 1); + size_t size = framework::product(mask->dims()); + for (size_t i = 0; i < size; ++i) { + if (dist(engine) < dropout_prob) { + mask_data[i] = 0; + y_data[i] = 0; + } else { + mask_data[i] = 1; + y_data[i] = x_data[i]; + } + } + } else { + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + auto& place = + *context.template device_context().eigen_device(); + Y.device(place) = X * (1.0f - dropout_prob); + } + } +}; + +template +class DropoutGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(!context.Attr("is_test"), + "GradOp is only callable when is_test is false"); + + auto* grad_x = context.Output(framework::GradVarName("X")); + auto* grad_y = context.Input(framework::GradVarName("Out")); + auto* mask = context.Input("Mask"); + grad_x->mutable_data(context.GetPlace()); + + auto M = 
EigenMatrix::Reshape(*mask, 1); + auto dX = EigenMatrix::Reshape(*grad_x, 1); + auto dY = EigenMatrix::Reshape(*grad_y, 1); + + auto& place = + *context.template device_context().eigen_device(); + dX.device(place) = dY * M; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae82408da71f9424a7a64dc9d3e42759707683b9 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/edit_distance_op.h" + +namespace paddle { +namespace operators { + +class EditDistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("SequenceNum"), + "Output(SequenceNum) shouldn't be null."); + auto hyp_dims = ctx->GetInputDim("Hyps"); + auto ref_dims = ctx->GetInputDim("Refs"); + PADDLE_ENFORCE(hyp_dims.size() == 2 && hyp_dims[1] == 1, + "Input(Hyps) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + PADDLE_ENFORCE(ref_dims.size() == 2 && ref_dims[1] == 1, + "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + ctx->SetOutputDim("Out", ctx->GetInputDim("Refs")); + ctx->SetOutputDim("SequenceNum", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::DataType::FP32, + ctx.device_context()); + } +}; + +class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Hyps", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for hypothesis strings."); + AddInput("Refs", + "(2-D LoDTensor, 2nd dim. 
equal to 1) " + "The indices for reference strings."); + AddOutput("SequenceNum", "The sequence count of current batch"); + AddAttr("normalized", + "(bool, default false) Indicated whether to normalize " + "the edit distance by the length of reference string.") + .SetDefault(false); + AddOutput("Out", + "(2-D Tensor with shape [`batch_size` x 1]) " + "The output edit distances of EditDistance operator."); + AddComment(R"DOC( + +EditDistance operator computes the edit distances between a batch of hypothesis +strings and their references. + +Edit distance, also called Levenshtein distance, measures how dissimilar two strings +are by counting the minimum number of operations to transform one string into anthor. +Here the operations include insertion, deletion, and substitution. For example, +given hypothesis string A = "kitten" and reference B = "sitting", the edit distance +is 3 for A will be transformed into B at least after two substitutions and one +insertion: + + "kitten" -> "sitten" -> "sittin" -> "sitting" + +Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total +number denoted by `batch_size`, and the separation is specified by the LoD information. +And the `batch_size` reference strings are arranged in order in the same way in the +LoDTensor Input(Refs). + +Output(Out) contains the `batch_size` results and each stands for the edit stance +for a pair of strings respectively. If Attr(normalized) is true, the edit distance +will be divided by the length of reference string. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(edit_distance, ops::EditDistanceOp, ops::EditDistanceOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + edit_distance, ops::EditDistanceKernel); diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bdfead75e71752549f44a8b3c9b9e4501e8845a3 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -0,0 +1,156 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillFirstRow(T* dist, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < N + 1) { + dist[idx] = idx; + } +} + +template +__global__ void FillFirstColumn(T* dist, const int M, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < M + 1) { + dist[idx * (N + 1)] = idx; + } +} + +template +__global__ void Levenshtein(T* dist, const int64_t* x1, const int64_t* x2, + const int M, const int N, const int start) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int offset = N; + int index = start + idx * offset; + int row = index / (N + 1); + int col = index % (N + 1); + if (row > 0 && col > 0 && row < M + 1 && col < N + 1) { + int cost = x1[row - 1] == x2[col - 1] ? 0 : 1; + int dels = dist[(row - 1) * (N + 1) + col] + 1; + int ins = dist[row * (N + 1) + col - 1] + 1; + int subs = dist[(row - 1) * (N + 1) + (col - 1)] + cost; + dist[index] = min(dels, min(ins, subs)); + } +} + +template +__global__ void SetOutput(T* out, const T* dist, const int M, const int N, + bool normalized) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx == 0) { + out[0] = normalized ? 
dist[M * (N + 1) + N] / N : dist[M * (N + 1) + N]; + } +} + +template +class EditDistanceGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + auto* sequence_num = ctx.Output("SequenceNum"); + sequence_num->mutable_data(ctx.GetPlace()); + + auto normalized = ctx.Attr("normalized"); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + + const size_t num_strs = hyp_lod.size() - 1; + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + sequence_num, static_cast(num_strs)); + + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; num++) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + if (m == 0 || n == 0) { + distance = std::max(m, n); + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + memory::Copy(boost::get(ctx.GetPlace()), out + num, + platform::CPUPlace(), &distance, sizeof(T), stream); + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + + FillFirstColumn<<<1 + m / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n); + + FillFirstRow<<<1 + n / 
PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n); + // Compute the elements of distance matrix in the anti-diagonal diretion + for (int64_t slice = 2; slice < m + n + 1; ++slice) { + int z_m = slice < m + 1 ? 0 : slice - m; + int z_n = slice < n + 1 ? 0 : slice - n; + int size = slice - (z_m + z_n) + 1; // number of elments in the same + // anti-diagonal line to update + // the start index at which computes from + int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; + Levenshtein<<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2, + m, n, start); + } + SetOutput<<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + edit_distance, + ops::EditDistanceGPUKernel); diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h new file mode 100644 index 0000000000000000000000000000000000000000..205e16e6bfe6b2d1678fca258ce1e70d29eff331 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace operators { + +template +class EditDistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + auto* sequence_num = ctx.Output("SequenceNum"); + int64_t* seq_num_data = sequence_num->mutable_data(ctx.GetPlace()); + + auto normalized = ctx.Attr("normalized"); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + auto num_strs = hyp_lod.size() - 1; + *seq_num_data = static_cast(num_strs); + + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; ++num) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + + if (m == 0) { + distance = n; + } else if (n == 0) { + distance = m; + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + for (int64_t i = 0; i < m + 1; ++i) { + dist[i * (n + 1)] = i; + } + for (int64_t j = 0; j < n + 1; ++j) { + dist[j] = j; + } + for (int64_t i = 1; i < m + 1; ++i) { + for (int64_t j = 1; j < n + 1; ++j) { + int cost = x1[i - 1] == x2[j - 1] ? 
0 : 1; + int dels = dist[(i - 1) * (n + 1) + j] + 1; + int ins = dist[i * (n + 1) + (j - 1)] + 1; + int subs = dist[(i - 1) * (n + 1) + (j - 1)] + cost; + dist[i * (n + 1) + j] = std::min(dels, std::min(ins, subs)); + } + } + distance = dist[m * (n + 1) + n]; + } + + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + out[num] = distance; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b9947b8c935fd2b4739162cbd2f98dc965cec2a --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseAddOpMaker : public ElementwiseOpMaker { + public: + ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Add", "Out = X + Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, + elementwise_add_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_add, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_add_grad, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2ac3a998ec46528ccb72fad1f5d73ce88992d995 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_add_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_add, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_add_grad, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h new file mode 100644 index 0000000000000000000000000000000000000000..248e3b9d617fadd914f27ad02861e27078600b61 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +class ElementwiseAddKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + AddFunctor(), z); + } +}; + +template +struct ElementwiseAddGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e; + } + } +}; + +template +struct ElementwiseAddBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseAddBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class 
ElementwiseAddGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseAddBroadCastGradFunctor, + ElementwiseAddBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..818ae82f44ccd159b36944e67521c3b730214539 --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/elementwise_div_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseDivOpMaker : public ElementwiseOpMaker { + public: + ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Div", "Out = X / Y"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, + elementwise_div_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/fluid/operators/elementwise_div_op.cu b/paddle/fluid/operators/elementwise_div_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d1bb7a474c06f68d33412512a9eb99757634d18e --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_div_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8e0726d9465fa988646f6f8dc74857a3bedf43e8 --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct DivFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a / b; } +}; + +template +class ElementwiseDivKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + DivFunctor(), z); + } +}; + +template +struct ElementwiseDivGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto y_e = framework::EigenVector::Flatten(*y); + auto z_e = framework::EigenVector::Flatten(*z); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = -1.0 * dz_e * z_e / y_e; + } + } +}; + +template +struct ElementwiseDivBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseDivBroadCast2GradFunctor { + template + void operator()(Device 
d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseDivGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseDivBroadCastGradFunctor, + ElementwiseDivBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1331bcadc8ce9a114d3dd7604273a3512e821e91 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_max_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseMaxOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Max", "Out = max(X, Y)"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker, + elementwise_max_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_max, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_max_grad, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise_max_op.cu b/paddle/fluid/operators/elementwise_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f0259ad0024c1a9b640f73ade5711b6eaa8f871 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_max_op.h" + +namespace ops = paddle::operators; +/* CUDA kernel registrations for elementwise_max and elementwise_max_grad (four entries each). */ +REGISTER_OP_CUDA_KERNEL( + elementwise_max, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_max_grad, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e1db9bcc01104c3462b562b7a37b5817e867d7e4 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { +/* Binary functor: returns a when a > b, otherwise b. */ +template +struct MaxFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a > b ? 
a : b; } +}; +/* Forward kernel: reads inputs X and Y and attribute axis, allocates Out, and applies MaxFunctor through ElementwiseComputeEx. */ +template +class ElementwiseMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MaxFunctor(), z); + } +}; +/* Gradient for the no-broadcast case: dx = (x > y) * dz and dy = (x <= y) * dz, so equal elements send the gradient to dy. A null dx or dy is skipped. The z argument is not used. */ +template +struct ElementwiseMaxGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e).template cast() * dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e <= y_e).template cast() * dz_e; + } + } +}; +/* Gradient when y is viewed as (1, n) and broadcast pre times to the flattened size of x: dx uses the broadcast y; dy is the masked dz reshaped to (pre, n) and summed over axis 0. */ +template +struct ElementwiseMaxBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; +/* Gradient when y is viewed as (1, n, 1) and broadcast to (pre, n, post): dy is the masked dz reshaped to (pre, n, post) and summed over axes 0 and 2. */ +template +struct ElementwiseMaxBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = 
framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; +/* Gradient kernel: fetches X, Y, Out, the gradient of Out, the gradient outputs for X and Y, and attribute axis, then hands the three functors above to ElementwiseGradCompute. */ +template +class ElementwiseMaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMaxBroadCastGradFunctor, + ElementwiseMaxBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d69099c8e6bd8e11a605758f553a9edd9cc322e --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_min_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +/* Proto maker for elementwise_min: passes the name "Min" and the equation "Out = min(X, Y)" to the inherited SetComment, then registers the inherited comment_ text with AddComment. */ class ElementwiseMinOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMinOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Min", "Out = min(X, Y)"); /* the name must be "Min" so the generated operator doc matches the equation */ + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; /* Below: elementwise_min is registered together with its gradient op elementwise_min_grad, followed by four CPU kernel entries for the forward op and four for the gradient op. */ +REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker, + elementwise_min_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_min, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_min_grad, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel); diff --git a/paddle/fluid/operators/elementwise_min_op.cu b/paddle/fluid/operators/elementwise_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed53204735056477b5c59ce5082d377501409c65 --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_min_op.h" + +namespace ops = paddle::operators; +/* CUDA kernel registrations for elementwise_min and elementwise_min_grad (four entries each). */ +REGISTER_OP_CUDA_KERNEL( + elementwise_min, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_min_grad, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel); diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bfe213dd4318aef8b1fb299ba40981d2feef8f9d --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { +/* Binary functor: returns a when a < b, otherwise b. */ +template +struct MinFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a < b ? 
a : b; } +}; +/* Forward kernel: reads inputs X and Y and attribute axis, allocates Out, and applies MinFunctor through ElementwiseComputeEx. */ +template +class ElementwiseMinKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MinFunctor(), z); + } +}; +/* Gradient for the no-broadcast case: dx = (x < y) * dz and dy = (x >= y) * dz, so equal elements send the gradient to dy. A null dx or dy is skipped. The z argument is not used. */ +template +struct ElementwiseMinGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e).template cast() * dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e >= y_e).template cast() * dz_e; + } + } +}; +/* Gradient when y is viewed as (1, n) and broadcast pre times to the flattened size of x: dx uses the broadcast y; dy is the masked dz reshaped to (pre, n) and summed over axis 0. */ +template +struct ElementwiseMinBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; +/* Gradient when y is viewed as (1, n, 1) and broadcast to (pre, n, post): dy is the masked dz reshaped to (pre, n, post) and summed over axes 0 and 2. */ +template +struct ElementwiseMinBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = 
framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; +/* Gradient kernel: fetches X, Y, Out, the gradient of Out, the gradient outputs for X and Y, and attribute axis, then hands the three functors above to ElementwiseGradCompute. */ +template +class ElementwiseMinGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMinBroadCastGradFunctor, + ElementwiseMinBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cb96f21d1b5049f9c7193f0a951e08986884c70 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +/* Proto maker for elementwise_mul: passes the name "Mul" and the equation string below to the inherited SetComment, then registers the inherited comment_ text with AddComment. */ +class ElementwiseMulOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Mul", "Out = X \\odot\\ Y"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; /* Below: elementwise_mul is registered together with its gradient op elementwise_mul_grad, followed by four CPU kernel entries for the forward op and four for the gradient op. */ +REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, + elementwise_mul_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_mul, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_mul_grad, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise_mul_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d72b6250eed24adbfc18e95a659a82b8e9916bc1 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_mul_op.h" + +namespace ops = paddle::operators; +/* CUDA kernel registrations for elementwise_mul and elementwise_mul_grad (four entries each). */ +REGISTER_OP_CUDA_KERNEL( + elementwise_mul, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_mul_grad, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dc292eb1e7295780aacb0c34730044c3c8759cb7 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { +/* Binary functor: returns the product a * b. */ +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; +/* Forward kernel: reads inputs X and Y and attribute axis, allocates Out, and applies MulFunctor through ElementwiseComputeEx. */ +template +class ElementwiseMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); + } +}; +/* Gradient for the no-broadcast case: dx = dz * y and dy = x * dz. A null dx or dy is skipped. The z argument is not used. */ +template +struct ElementwiseMulGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = x_e * dz_e; + } + } +}; +/* Gradient when y is viewed as (1, n) and broadcast pre times to the flattened size of x: dx = dz * broadcast y; dy is x * dz reshaped to (pre, n) and summed over axis 0. */ +template +struct ElementwiseMulBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; +/* Gradient when y is viewed as (1, n, 1) and broadcast to (pre, n, post): dy is x * dz reshaped to (pre, n, post) and summed over axes 0 and 2. */ +template +struct ElementwiseMulBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N 
n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; +/* Gradient kernel: fetches X, Y, Out, the gradient of Out, the gradient outputs for X and Y, and attribute axis, then hands the three functors above to ElementwiseGradCompute. */ +template +class ElementwiseMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMulBroadCastGradFunctor, + ElementwiseMulBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h new file mode 100644 index 0000000000000000000000000000000000000000..38f83d7ad36d3cb6b42a283fec3c431b51747d4e --- /dev/null +++ b/paddle/fluid/operators/elementwise_op.h @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +/* Forward op definition shared by the elementwise_* operators: InferShape requires X, Y and Out to exist, enforces rank(X) >= rank(Y), gives Out the dims of X and shares X's LoD with Out. */ +class ElementwiseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + using Tensor = framework::Tensor; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + ctx->SetOutputDim("Out", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; +/* Base proto maker: declares inputs X and Y, output Out and attribute axis (default -1, checked with EqualGreaterThan(-1)), and stores in comment_ a documentation template containing the placeholders {name} and {equation} that derived makers fill in through SetComment. */ +class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), The first input tensor of elementwise op."); + AddInput("Y", "(Tensor), The second input tensor of elementwise op."); + AddOutput("Out", "The output of elementwise op."); + AddAttr("axis", + "(int, default -1). The start dimension index " + "for broadcasting Y onto X.") + .SetDefault(-1) + .EqualGreaterThan(-1); + comment_ = R"DOC( +Limited Elementwise {name} Operator. 
+ +The equation is: + +$${equation}$$ + +$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be +smaller than or equal to the dimensions of $X$. + +There are two cases for this operator: +1. The shape of $Y$ is same with $X$; +2. The shape of $Y$ is a subset of $X$. + +For case 2: +$Y$ will be broadcasted to match the shape of $X$ and axis should be +set to index of the start dimension to broadcast $Y$ onto $X$. + +For example + .. code-block:: python + + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + +Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details) +information. However, the output only shares the LoD information with input $X$. + +)DOC"; + AddComment(comment_); + } + + protected: + std::string comment_; +/* Replaces every occurrence of `from` in `src` with `to`, scanning left to right and resuming the search just past each inserted `to` so replaced text is never rescanned. Does nothing when `from` is empty. */ + void Replace(std::string& src, std::string from, std::string to) { + if (from.empty()) return; /* an empty pattern matches at every position, so the loop below would keep inserting `to` and never finish */ + const std::size_t len_from = from.size(), len_to = to.size(); /* size() needs no <cstring> and stays correct if a string holds an embedded NUL */ + for (std::size_t pos = src.find(from); pos != std::string::npos; + pos = src.find(from, pos + len_to)) { + src.replace(pos, len_from, to); + } + } +/* Fills the {name} and {equation} placeholders of comment_ with the given strings. */ + void SetComment(std::string name, std::string equation) { + Replace(comment_, "{name}", name); + Replace(comment_, "{equation}", equation); + } +}; +/* Gradient op definition shared by the elementwise_*_grad operators: requires X, Y and the gradient of Out, enforces rank(X) >= rank(Y), and gives the gradient outputs of X and Y (when present) the dims of X and Y. */ +class ElementwiseOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); 
+ auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h new file mode 100644 index 0000000000000000000000000000000000000000..c1269382a447d4f8d089c5fc392495d418123e48 --- /dev/null +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -0,0 +1,406 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/transform.h" + +#ifdef __NVCC__ +#include +#endif + +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +/* + * Out = X ⊙ Y + * If Y's shape does not match X's shape, they will be reshaped. + * For example: + * 1. 
shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1,12,1).broadcast(2,12,5) + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(6, 20) * y.shape(1,20).broadcast(6,20) + * get_mid_dims writes pre, n and post through its reference arguments and enforces that every Y dimension equals the X dimension it is aligned with, starting at axis. */ +inline void get_mid_dims(const framework::DDim& x_dims, + const framework::DDim& y_dims, const int axis, + int& pre, int& n, int& post) { + pre = 1; + n = 1; + post = 1; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + + for (int i = 0; i < y_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], + "Broadcast dimension mismatch."); + n *= y_dims[i]; + } + + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + post *= x_dims[i]; + } +} + +template +class RowwiseTransformIterator; +template +class MidWiseTransformIterator; +/* Host-side iterator over y for the row-wise broadcast: the index wraps back to 0 after n increments, so a length-n y is re-read cyclically. */ +template +class RowwiseTransformIterator { + public: + RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + + RowwiseTransformIterator& operator++() { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + return *this; + } + + bool operator==(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int i_; + int64_t n_; +}; +/* Host-side iterator over y for the mid-wise broadcast: j_ counts repeats, so each y element is yielded post times before i_ advances, and i_ wraps back to 0 after n elements. */ +template +class MidWiseTransformIterator { + public: + MidWiseTransformIterator(const T* ptr, int n, int post) + : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} + + MidWiseTransformIterator& operator++() { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + return *this; + } + + bool operator==(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + 
private: + const T* ptr_; + int64_t i_; + int64_t j_; + int64_t n_; + int64_t post_; +}; + +#ifdef __NVCC__ /* thrust-based counterparts of the two iterators above; dereference() maps the linear offset from begin_ back into y: offset % n_ (row-wise) or (offset / post_) % n_ (mid-wise) */ +template +class RowwiseTransformIterator + : public thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> + super_t; + HOSTDEVICE RowwiseTransformIterator(const T* x, int n) + : super_t(x), begin_(x), n_(n){}; + friend class thrust::iterator_core_access; + + private: + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (this->base() - begin_) % n_); + } +}; + +template +class MidWiseTransformIterator + : public thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> + super_t; + HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) + : super_t(x), begin_(x), n_(n), post_(post){}; + friend class thrust::iterator_core_access; + + private: + unsigned int post_; + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (((this->base() - begin_) / post_) % n_)); + } +}; +#endif +/* Applies a binary functor over all nx_ elements of x, pairing them with y, and writes to z via platform::Transform: Run pairs x with y directly, RunRowWise reads y through RowwiseTransformIterator, RunMidWise through MidWiseTransformIterator. The pre argument of the two broadcast variants is not used. */ +template +class TransformFunctor { + public: + TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z, const DeviceContext& ctx, Functor func) + : x_(x->data()), + y_(y->data()), + z_(z->mutable_data(ctx.GetPlace())), + nx_(x->numel()), + ctx_(ctx), + func_(func) {} + + inline void Run() const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, y_, z_, func_); + } + + inline void RunRowWise(int n, int pre) const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), + z_, func_); + } + + inline void RunMidWise(int n, int pre, int post) const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); + } + + private: + const 
T* x_; + const T* y_; + OutType* z_; + int64_t nx_; + const DeviceContext& ctx_; + Functor func_; +}; +/* EIGEN_FUNCTOR(name, eigen_op) defines struct Eigen<name>Functor with Run (same shape), RunBroadCast (y viewed as (1, n), broadcast pre times) and RunBroadCast2 (y viewed as (1, n, 1), broadcast to (pre, n, post)); each assigns eigen_op(x, y) to the flattened z on the context's Eigen device. */ +#define EIGEN_FUNCTOR(name, eigen_op) \ + struct Eigen##name##Functor { \ + template \ + inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ + framework::Tensor* z, \ + const framework::ExecutionContext& ctx) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_e); \ + } \ + template \ + inline void RunBroadCast(const framework::Tensor* x, \ + const framework::Tensor* y, framework::Tensor* z, \ + const framework::ExecutionContext& ctx, int pre, \ + int n) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ + .broadcast(Eigen::DSizes(pre, 1)) \ + .reshape(Eigen::DSizes(x_e.size())); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ + } \ + template \ + inline void RunBroadCast2(const framework::Tensor* x, \ + const framework::Tensor* y, \ + framework::Tensor* z, \ + const framework::ExecutionContext& ctx, int pre, \ + int n, int post) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ + .broadcast(Eigen::DSizes(pre, 1, post)) \ + .reshape(Eigen::DSizes(x_e.size())); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ + } \ + } +/* Eigen-based forward: reads X, Y, Out and attribute axis from ctx, resolves axis == -1 to rank(X) - rank(Y), validates it, and dispatches to functor::Run, RunBroadCast (post == 1) or RunBroadCast2. */ +template +void ElementwiseCompute(const framework::ExecutionContext& ctx) { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + 
z->mutable_data(ctx.GetPlace()); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor f; + f.template Run(x, y, z, ctx); + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor f; + f.template RunBroadCast(x, y, z, ctx, pre, n); + return; + } else { + functor f; + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + return; + } +} + +#define EIGEN_ADD(x, y) ((x) + (y)) +EIGEN_FUNCTOR(Add, EIGEN_ADD); + +#define EIGEN_SUB(x, y) ((x) - (y)) +EIGEN_FUNCTOR(Sub, EIGEN_SUB); + +#define EIGEN_MUL(x, y) ((x) * (y)) +EIGEN_FUNCTOR(Mul, EIGEN_MUL); + +#define EIGEN_DIV(x, y) ((x) / (y)) +EIGEN_FUNCTOR(Div, EIGEN_DIV); +/* Backward dispatcher: allocates dx and dy when they are non-null, then calls functor (equal shapes), broadcastfunctor (post == 1) or broadcast2functor. A y of shape (1) is treated as a scalar by appending a trailing 1 to x_dims. axis == -1 resolves to rank(X) - rank(Y), and the resolved axis is validated before get_mid_dims. */ +template +void ElementwiseGradCompute(const framework::ExecutionContext& ctx, + + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, int axis, + framework::Tensor* dx, framework::Tensor* dy) { + auto& place = *ctx.template device_context().eigen_device(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + if (dx) { + dx->mutable_data(ctx.GetPlace()); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + } + + if (x_dims == y_dims) { + functor f; + f(place, x, y, out, dx, dy, dout); + return; + } + + if (y_dims.size() == 1 && y_dims[0] == 1) { + // y is a scalar + auto extended_dims = framework::vectorize(x_dims); + extended_dims.push_back(1); + x_dims = framework::make_ddim(extended_dims); + } + + axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); /* same validation ElementwiseComputeEx performs, so an out-of-range axis fails here instead of indexing x_dims out of range in get_mid_dims */ + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + + if (post == 1) { + broadcastfunctor f; + f(place, x, y, out, dx, dy, dout, pre, n); + return; + } else { + broadcast2functor f; + f(place, x, y, out, dx, dy, dout, pre, n, post); + return; + } +} +/* Transform-based forward: builds a TransformFunctor over x, y, z and runs Run (equal shapes), RunRowWise (post == 1) or RunMidWise. A y of shape (1) is treated as a scalar by appending a trailing 1 to x_dims; axis == -1 resolves to rank(X) - rank(Y) and is validated before get_mid_dims. */ +template +void ElementwiseComputeEx(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, int axis, Functor func, + framework::Tensor* z) { + TransformFunctor functor( + x, y, z, ctx.template device_context(), func); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor.Run(); + return; + } + + if (y_dims.size() == 1 && y_dims[0] == 1) { + // y is a scalar + auto extended_dims = framework::vectorize(x_dims); + extended_dims.push_back(1); + x_dims = framework::make_ddim(extended_dims); + } + + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise_pow_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..911b5dbd2501e6c5ef6177a23592fadeb3383002 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_pow_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +/* Op-proto maker for elementwise_pow: construction is delegated to ElementwiseOpMaker, after which the constructor calls SetComment("Pow", "Out = X ^ Y") and passes comment_ to AddComment. */ class ElementwisePowOpMaker : public ElementwiseOpMaker { + public: + ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Pow", "Out = X ^ Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +/* Registers elementwise_pow (ElementwiseOp plus the maker above) through REGISTER_OP_WITHOUT_GRADIENT, then its CPU kernels. NOTE(review): template argument lists appear to have been stripped from this dump, so the two ElementwisePowKernel entries read identically here; confirm the real instantiations against the repository. */ REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp, + ops::ElementwisePowOpMaker); +REGISTER_OP_CPU_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/fluid/operators/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise_pow_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2996600738fe11c3fef67c3f4c5660ff05e37957 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_pow_op.h" + +namespace ops = paddle::operators; + +/* CUDA kernel registration for elementwise_pow; EIGEN_USE_GPU is defined above before the kernel header is included. NOTE(review): the two ElementwisePowKernel entries read identically because template argument lists appear to have been stripped from this dump; confirm the real instantiations against the repository. */ REGISTER_OP_CUDA_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/fluid/operators/elementwise_pow_op.h b/paddle/fluid/operators/elementwise_pow_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b793c1eae0ec3a796897c7d81ac061f80ccffdb6 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +/* Binary functor whose call operator returns std::pow(a, b); it is marked HOSTDEVICE. NOTE(review): the bare `template` keywords, the empty first #include and argument-less calls such as ctx.Input("X") suggest that everything between angle brackets was lost in this dump; confirm against the repository. */ template +struct PowFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); } +}; + +/* Kernel for elementwise_pow. Compute() reads inputs "X" and "Y", allocates output "Out" on ctx.GetPlace(), reads the "axis" attribute, and forwards all of them together with a PowFunctor instance to ElementwiseComputeEx. */ template +class ElementwisePowKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + PowFunctor(), z); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..46ce01c7cf5bf4930d05535e22f1d54073838071 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +/* Op-proto maker for elementwise_sub: construction is delegated to ElementwiseOpMaker, after which the constructor calls SetComment("Sub", "Out = X - Y") and passes comment_ to AddComment. */ class ElementwiseSubOpMaker : public ElementwiseOpMaker { + public: + ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Sub", "Out = X - Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +/* Registers elementwise_sub together with its gradient op elementwise_sub_grad (ElementwiseOpGrad), then four CPU kernel entries each for the forward and the gradient kernel. NOTE(review): the four entries per list read identically because template argument lists appear to have been stripped from this dump; confirm the real instantiations against the repository. */ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, + elementwise_sub_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_sub, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/fluid/operators/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise_sub_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..eb09d6c5edcb6e8460de71d76077fd103d799847 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_sub_op.h" + +namespace ops = paddle::operators; + +/* CUDA kernel registration for elementwise_sub and elementwise_sub_grad, four entries each; EIGEN_USE_GPU is defined above before the kernel header is included. NOTE(review): the entries read identically because template argument lists appear to have been stripped from this dump; confirm the real instantiations against the repository. */ REGISTER_OP_CUDA_KERNEL( + elementwise_sub, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h new file mode 100644 index 0000000000000000000000000000000000000000..af2d497b9ae8f892aa272211ee2158d063d13909 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +/* Binary functor whose call operator returns a - b; it is marked HOSTDEVICE. NOTE(review): the bare `template` keywords and argument-less calls such as ctx.Input("X") or Eigen::DSizes(pre, n) suggest that everything between angle brackets was lost in this dump; confirm against the repository. */ template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +/* Forward kernel for elementwise_sub. Compute() reads inputs "X" and "Y", allocates output "Out" on ctx.GetPlace(), reads the "axis" attribute, and forwards all of them together with a SubFunctor instance to ElementwiseComputeEx. */ template +class ElementwiseSubKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + SubFunctor(), z); + } +}; + +/* Gradient functor without any reshape: when dx is non-null it receives dz unchanged; when dy is non-null it receives (-1.0) * dz. The x, y and z arguments are accepted but not read. */ template +struct ElementwiseSubGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * dz_e; + } + } +}; + +/* Gradient functor for the (pre, n) case: dx (when non-null) receives dz unchanged; dy (when non-null) receives (-1.0) times dz reshaped to (pre, n) and summed over dimension 0. */ template +struct ElementwiseSubBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * + dz_e.reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +/* Gradient functor for the (pre, n, post) case: dx (when non-null) receives dz unchanged; dy (when non-null) receives (-1.0) times dz reshaped to (pre, n, post) and summed over dimensions 0 and 2. */ template +struct ElementwiseSubBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * + dz_e.reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; +
+/* Gradient kernel for elementwise_sub. Compute() fetches inputs "X", "Y", "Out" and the variable named by framework::GradVarName("Out"), the outputs named by GradVarName("X") and GradVarName("Y"), reads the "axis" attribute, and dispatches to ElementwiseGradCompute parameterised with ElementwiseSubGradFunctor, ElementwiseSubBroadCastGradFunctor and ElementwiseSubBroadCast2GradFunctor. NOTE(review): template parameter and argument lists appear to have been stripped from this dump; confirm against the repository. */ template +class ElementwiseSubGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseSubBroadCastGradFunctor, + ElementwiseSubBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ccb9a94856fe868c8069510a7c557dfb8c22c369 --- /dev/null +++ b/paddle/fluid/operators/expand_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/expand_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +/* Operator definition for expand. InferShape requires input X and output Out, requires expand_times to have as many entries as X has dimensions, rejects X with more than 6 dimensions, requires every expand_times[i] >= 1, sets output dimension i to x_dims[i] * expand_times[i], and shares X's LoD with Out only when dimension 0 is unchanged. NOTE(review): out_shape[0] and x_dims[0] are read without a rank >= 1 check in this block; confirm that rank-0 inputs cannot reach it. NOTE(review): template argument lists (e.g. on std::vector and Get) appear to have been stripped from this dump. */ class ExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); + auto x_dims = ctx->GetInputDim("X"); + + PADDLE_ENFORCE_EQ(static_cast(x_dims.size()), expand_times.size(), + "The number of Attr(expand_times)'s value must be equal " + "to the rank of Input(X)."); + PADDLE_ENFORCE_LE(x_dims.size(), 6, + "The rank of Input(X) must not be greater than 6."); + + std::vector out_shape(x_dims.size()); + for (size_t i = 0; i < expand_times.size(); ++i) { + PADDLE_ENFORCE_GE(expand_times[i], 1, + "Each value of Attr(expand_times) should not be " + "less than 1."); + out_shape[i] = x_dims[i] * expand_times[i]; + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); + if (out_shape[0] == x_dims[0]) { + ctx->ShareLoD("X", "Out"); + } + } +}; + +/* Proto maker for expand: declares input X, output Out, the expand_times attribute, and the operator documentation string. */ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "X is the input to be expanded."); + AddOutput("Out", + "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "The rank of Output(Out) have the same with Input(X).
" + "After expanding, size of each dimension of Output(Out) is equal " + "to size of the corresponding dimension of Input(X) multiplying " + "the corresponding value given by Attr(expand_times)."); + AddAttr>("expand_times", + "Expand times number for each dimension."); + AddComment(R"DOC( +Expand operator tiles the input by given times number. You should set times +number for each dimension by providing attribute 'expand_times'. The rank of X +should be in [1, 6]. Please note that size of 'expand_times' must be the same +with X's rank. Following is a using case: + +Input(X) is a 3-D tensor with shape [2, 3, 1]: + + [ + [[1], [2], [3]], + [[4], [5], [6]] + ] + +Attr(expand_times): [1, 2, 2] + +Output(Out) is a 3-D tensor with shape [2, 6, 2]: + + [ + [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], + [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] + ] + +)DOC"); + } +}; + +/* Shape inference for expand_grad: requires input X and the input named by framework::GradVarName("Out"), checks x_dims[i] * expand_times[i] == out_dims[i] for every index of expand_times, and, when the output named by GradVarName("X") exists, sets its dims to x_dims. */ class ExpandGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + for (size_t i = 0; i < expand_times.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], + "Each dimension size of Input(Out@GRAD) should be " + "equal to multiplication of crroresponding dimension " + "size of Input(X) and Attr(expand_times) value."); + } + + auto x_grad_name = framework::GradVarName("X"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +/* Registers expand with its gradient op expand_grad, followed by one CPU kernel entry for each. */ REGISTER_OP(expand,
ops::ExpandOp, ops::ExpandOpMaker, expand_grad, + ops::ExpandGradOp); +REGISTER_OP_CPU_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a9f39708beec3e0d32d65245c63f8ccf9df8604 --- /dev/null +++ b/paddle/fluid/operators/expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/expand_op.h" + +namespace ops = paddle::operators; +/* CUDA kernel registration for expand and expand_grad; EIGEN_USE_GPU is defined above before the kernel header is included. */ REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8df1cd34d7dc5093b9bdcd3d015be4f9958d089d --- /dev/null +++ b/paddle/fluid/operators/expand_op.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +/* Preprocessor scaffolding for the rank dispatch below. EXPAND_TEMPLATE emits a `case n + 1` label that calls Expand, and REP_EXPAND_TEMPLATE(n) repeats it through BOOST_PP_REPEAT. EXPAND_GRAD_TEMPLATE emits a `case n` label that calls ExpandBackward only when COND(n) holds, i.e. n / MAX_RANK_SUPPORTED >= n % MAX_RANK_SUPPORTED, and REP_EXPAND_GRAD_TEMPLATE(n) repeats it. NOTE(review): the include targets and the template arguments inside these macros appear to have been stripped from this dump; confirm against the repository. */ #define MAX_RANK_SUPPORTED 6 + +#define EXPAND_TEMPLATE(z, n, data) \ + case n + 1: { \ + Expand(context); \ + break; \ + } +#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) +#define COND(n) \ + BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \ + BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) +#define EXPAND_GRAD_CASE(n) \ + case n: { \ + ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ + } +#define EXPAND_GRAD_TEMPLATE(z, n, data) \ + BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) +#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~) + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenTensor = framework::EigenTensor; + +/* Forward kernel for expand. Compute() switches on the number of dimensions of input X; the macro-generated cases call Expand, and any other rank fails the PADDLE_ENFORCE in `default`. Expand() copies expand_times into bcast_dims, allocates Out on context.GetPlace(), and assigns x.broadcast(bcast_dims) to it on the Eigen device obtained from the execution context. */ template +class ExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + switch (rank) { + REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) + default: + PADDLE_ENFORCE(false, + "Only support tensor with rank being between 1 and 6."); + } + } + + protected: + template + void Expand(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + auto& expand_times = context.Attr>("expand_times"); + auto* out0 =
context.Output("Out"); + Eigen::DSizes bcast_dims; + auto x_dims = in0->dims(); + for (size_t i = 0; i < expand_times.size(); ++i) { + bcast_dims[i] = expand_times[i]; + } + auto x = EigenTensor::From(*in0); + out0->mutable_data(context.GetPlace()); + auto y = EigenTensor::From(*out0); + auto& place = + *context.template device_context().eigen_device(); + y.device(place) = x.broadcast(bcast_dims); + } +}; + +/* Gradient kernel for expand. Compute() builds reshape_dims_vec and reduce_dims_vec as described in the comment inside it, packs both sizes into `dims` = reshape size * MAX_RANK_SUPPORTED + reduce size - MAX_RANK_SUPPORTED - 1, and then either copies the Out gradient straight into the X gradient (when reduce_dims_vec is empty) or switches on `dims` to call ExpandBackward. */ template +class ExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto& expand_times = context.Attr>("expand_times"); + auto x_dims = in0->dims(); + // 1. reshape_dims_vec is the broadcast parameter. For each dimension i, + // if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two + // dimensions [expand_times[i], x_dims[i]]. + // 2. reduce_dims_vec is the dimension parameter to compute gradients. For + // each dimension expanded, the gradients should be summed to original + // size.
+ std::vector reshape_dims_vec; + std::vector reduce_dims_vec; + for (size_t i = 0; i < expand_times.size(); ++i) { + if (expand_times[i] == 1) { + reshape_dims_vec.push_back(x_dims[i]); + } else { + if (x_dims[i] == 1) { + reduce_dims_vec.push_back(reshape_dims_vec.size()); + reshape_dims_vec.push_back(expand_times[i]); + } else { + reduce_dims_vec.push_back(reshape_dims_vec.size()); + reshape_dims_vec.push_back(expand_times[i]); + reshape_dims_vec.push_back(x_dims[i]); + } + } + } + + int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED + + reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1; + // no need reduce, just copy + if (reduce_dims_vec.size() == 0) { + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + out0->mutable_data(context.GetPlace()); + framework::Copy(*in0, context.GetPlace(), context.device_context(), out0); + } else { + switch (dims) { + REP_EXPAND_GRAD_TEMPLATE(72) + default: + PADDLE_ENFORCE( + false, "Only support tensor with rank being between 1 and 6."); + } + } + } + + protected: + /* Unpacks the template value Dims into reshape_size = Dims / MAX_RANK_SUPPORTED + 1 and reduce_size = Dims % MAX_RANK_SUPPORTED + 1, enforces that they match the two vectors, then writes out_grad.reshape(reshape_dims).sum(reduce_dims), reshaped to the dimensions of flattened X, into the X gradient. */ template + void ExpandBackward(const framework::ExecutionContext& context, + const std::vector& reshape_dims_vec, + const std::vector& reduce_dims_vec) const { + size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1; + size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1; + PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(), + "Inconsistent size between template Dims and " + "reshape dimensions."); + PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(), + "Inconsistent size between template Dims and " + "reduce dimensions."); + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + auto x = EigenVector::Flatten(*(context.Input("X"))); + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenVector::Flatten(*out0); + Eigen::DSizes reshape_dims; + for (size_t i = 0; i < reshape_size; ++i) { + reshape_dims[i] =
reshape_dims_vec[i]; + } + Eigen::DSizes reduce_dims; + for (size_t i = 0; i < reduce_size; ++i) { + reduce_dims[i] = reduce_dims_vec[i]; + } + auto out_grad = EigenVector::Flatten(*in0); + x_grad.device( + *context.template device_context().eigen_device()) = + out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b3f5f0d1d09a932e15936285f5cb226daa86e95 --- /dev/null +++ b/paddle/fluid/operators/feed_op.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +/* Operator that moves one column of a feed list into an output variable. Run() looks up the variables named by input X and output Out in the scope (enforcing that both exist), takes element `col` of the feed list held by X, and either shares its data with the output item when the feed item already lives on `place`, or copies it to `place` using the device context fetched from DeviceContextPool. The feed item's LoD is set on the output in both cases. NOTE(review): template argument lists (on Attr, Get, GetMutable, static_cast) appear to have been stripped from this dump; confirm against the repository. */ class FeedOp : public framework::OperatorBase { + public: + FeedOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto feed_var_name = Input("X"); + auto *feed_var = scope.FindVar(feed_var_name); + + PADDLE_ENFORCE(feed_var != nullptr, + "Cannot find feed_var in scope, feed_var_name is %s", + feed_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto col = Attr("col"); + + VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var " + << out_name; + + auto &feed_list = feed_var->Get(); + auto &feed_item = feed_list.at(static_cast(col)); + auto *out_item = out_var->GetMutable(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + if (platform::is_same_place(feed_item.place(), place)) { + out_item->ShareDataWith(feed_item); + } else { + framework::Copy(feed_item, place, dev_ctx, out_item); + } + out_item->set_lod(feed_item.lod()); + } +}; + +/* Proto maker for feed: declares input X, output Out, the `col` attribute, and the operator documentation string. */ class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of feed op"); + AddOutput("Out", "The output of feed op"); + AddAttr("col", "(int) The column of feed"); + AddComment(R"DOC( +Feed Operator. + +It should not be configured by users directly.
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +/* Registers the feed operator, passing paddle::framework::EmptyGradOpMaker and the proto maker above to REGISTER_OPERATOR. */ REGISTER_OPERATOR(feed, paddle::operators::FeedOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::FeedOpInfoMaker); diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..54e5892016cdb01f50189147a7453b868c5a48c0 --- /dev/null +++ b/paddle/fluid/operators/fetch_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +/* Operator that copies a variable into slot `col` of a fetch list. Run() looks up the variables named by input X and output Out in the scope (enforcing that both exist), grows the fetch list to col + 1 entries when col is past its end, copies the source item to platform::CPUPlace() using the device context of the source item's place, waits on that context, and then sets the source LoD on the destination item. NOTE(review): template argument lists (on Attr, Get, GetMutable, static_cast) appear to have been stripped from this dump; confirm against the repository. */ class FetchOp : public framework::OperatorBase { + public: + FetchOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto fetch_var_name = Input("X"); + auto *fetch_var = scope.FindVar(fetch_var_name); + PADDLE_ENFORCE(fetch_var != nullptr, + "Cannot find fetch variable in scope, fetch_var_name is %s", + fetch_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto col = static_cast(Attr("col")); + + auto *fetch_list = out_var->GetMutable(); + auto &src_item = fetch_var->Get(); + + if (col >= fetch_list->size()) { + fetch_list->resize(col + 1); + } + auto &dst_item = fetch_list->at(col); + + // FIXME(yuyang18): Should we assume the fetch operator always generate + // CPU outputs? + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(src_item.place()); + + Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item); + dev_ctx.Wait(); + dst_item.set_lod(src_item.lod()); + + VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; + } +}; + +/* Proto maker for fetch: declares input X, output Out, the `col` attribute, and the operator documentation string. */ class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of fetch op"); + AddOutput("Out", "The output of fetch op"); + AddAttr("col", "(int) The column of fetch"); + AddComment(R"DOC( +Fetch Operator.
+ +It should not be configured by users directly. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +/* Registers the fetch operator, passing paddle::framework::EmptyGradOpMaker and the proto maker above to REGISTER_OPERATOR. */ REGISTER_OPERATOR(fetch, paddle::operators::FetchOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::FetchOpInfoMaker); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6992ba371c1dd61f7f6fa293be586818350fb3f --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h" + +namespace paddle { +namespace operators { + +class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("Input"), + "Input(Input) of FillConstantBatchSizeLikeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FillConstantBatchSizeLikeOp should not be null."); + + auto &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_GT(shape.size(), 0); + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + auto output_dim = framework::make_ddim(shape_int64); + + int input_dim_idx = ctx->Attrs().Get("input_dim_idx"); + PADDLE_ENFORCE_GE(input_dim_idx, 0); + PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx); + + int output_dim_idx = ctx->Attrs().Get("output_dim_idx"); + PADDLE_ENFORCE_GE(output_dim_idx, 0); + PADDLE_ENFORCE_GT(static_cast(shape.size()), output_dim_idx); + + output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx]; + ctx->SetOutputDim("Out", output_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class FillConstantBatchSizeLikeOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddInput("Input", + "(Tensor) Tensor " + "whose dim_idx th dimension is used to specify the batch_size"); + 
AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddAttr>("shape", "(vector) The shape of the output"); + AddAttr("input_dim_idx", + "(int, default 0) The index of input's batch size dimension") + .SetDefault(0); + AddAttr("output_dim_idx", + "(int, default 0) The index of output's batch size dimension") + .SetDefault(0); + AddAttr("value", "(float, default 0) The value to be filled") + .SetDefault(0.0f); + AddComment(R"DOC( +FillConstantBatchSizeLike Operator. + +Fill up a variable with specified constant value. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOp, + paddle::framework::EmptyGradOpMaker, + ops::FillConstantBatchSizeLikeOpMaker); +REGISTER_OP_CPU_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4f4d2a50305e2582f23ceed931d655f9690e110 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..da4a20d99a13533019d57fca42b1b49780200b79 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + auto value = ctx.Attr("value"); + + math::SetConstant setter; + setter(ctx.template device_context(), out, + static_cast(value)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d4bf6406e5716a6b65a234d1cd642b64dcc5726f --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class FillConstantInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillConstantOp should not be null."); + auto &shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } +}; + +class FillConstantOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto data_type = + static_cast(Attr("dtype")); + auto value = Attr("value"); + auto force_cpu = Attr("force_cpu"); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + out.Resize(framework::make_ddim(Attr>("shape"))); + if (force_cpu) { + auto cpu = platform::CPUPlace(); + out.mutable_data(cpu, framework::ToTypeIndex(data_type)); + } else { + out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + math::set_constant(dev_ctx, &out, value); + } +}; + +class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddAttr>("shape", "(vector) The shape of the output"); + AddAttr("value", "(float, default 0) The value to be filled") + .SetDefault(0.0f); + AddAttr("force_cpu", + "(bool, default false) Force fill output variable to cpu " + "memory. 
Otherwise, fill output variable to the running " + "device") + .SetDefault(false); + AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddComment(R"DOC( +FillConstantBatchSizeLike Operator. + +Fill up a variable with specified constant value. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, + ops::FillConstantInferShape, ops::FillConstantOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e318f37cf0bc945597b5aa7b384e53038c97786 --- /dev/null +++ b/paddle/fluid/operators/fill_op.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct FillOpVisitor { + FillOpVisitor(framework::LoDTensor *tensor, const std::vector &value) + : tensor_(tensor), value_(value) {} + + template + void operator()() const { + platform::CPUPlace cpu; + auto *data = tensor_->mutable_data(cpu); + std::transform(value_.data(), value_.data() + tensor_->numel(), data, + [](float dat) { return static_cast(dat); }); + } + + framework::LoDTensor *tensor_; + const std::vector &value_; +}; + +class FillOp : public framework::OperatorBase { + public: + FillOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &out = + detail::Ref(detail::Ref(scope.FindVar(Output("Out")), + "Cannot find variable %s", Output("Out")) + .GetMutable()); + out.Resize(framework::make_ddim(Attr>("shape"))); + auto dtype = static_cast(Attr("dtype")); + platform::CPUPlace cpu; + auto force_cpu = Attr("force_cpu"); + out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype)); + + framework::LoDTensor tensor; + + if (force_cpu || platform::is_cpu_place(place)) { + tensor.ShareDataWith(out); + } else { + // Always make tensor in CPU memory. 
+ tensor.Resize(out.dims()); + tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + } + + framework::VisitDataType( + dtype, FillOpVisitor(&tensor, Attr>("value"))); + + if (!force_cpu && platform::is_gpu_place(place)) { + // Copy tensor to out + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(tensor, place, dev_ctx, &out); + } + } +}; + +class FillOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC(Fill operator + +Fill an tensor with `value` and `shape`. The type of the tensor is specify by +`dtype`. +)DOC"); + AddOutput("Out", "(LoDTensor) The output tensor."); + AddAttr>( + "value", "The float values of tensor, which are flatten in row major"); + AddAttr>("shape", "The shape of output tensor"); + AddAttr("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::proto::DataType::FP32); + AddAttr("force_cpu", + "Whether the output tensor must be at CPU memory or not. " + "Default is false.") + .SetDefault(false); + } +}; + +class FillOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim( + "Out", + framework::make_ddim(context->Attrs().Get>("shape"))); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..958bfb1557d9fa39534caef594818aa97bbe03a6 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_zeros_like_op.h" + +namespace paddle { +namespace operators { + +class FillZerosLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FillZerosLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillZerosLikeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of fill-zeros-like op."); + AddOutput("Out", "The variable will be filled up with zeros."); + AddComment(R"DOC( +FillZerosLike Operator. + +Fill up a variable with zeros. +The output will have the same size as the input. 
+ +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, + ops::FillZerosLikeOpMaker); +REGISTER_OP_CPU_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..07078573d8aaa1d72f876f4be68ff70d8a56d8a1 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_zeros_like_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..141c3809e9aa3e2984bf802418f8ddf7d92fa446 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class FillZerosLikeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e72a173751e9b163b6083df474c2b46c76ed459d --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/ftrl_op.h" + +namespace paddle { +namespace operators { + +class FTRLOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SquaredAccumulator"), + "Input(SquaredAccumulator) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LinearAccumulator"), + "Input(LinearAccumulator) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of FTRL should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SquaredAccumOut"), + "Output(SquaredAccumOut) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("LinearAccumOut"), + "Output(LinearAccumOut) of FTRL should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), + "Two input of FTRL Op's dimension must be same."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("SquaredAccumOut", param_dim); + ctx->SetOutputDim("LinearAccumOut", param_dim); + } +}; + +class FTRLOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("SquaredAccumulator", + "(Tensor, default Tensor) " + "Accumulator that accumulates squared gradients."); + 
AddInput("LinearAccumulator", + "(Tensor, default Tensor) " + "Accumulator that accumulates linear gradients."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("SquaredAccumOut", + "(Tensor) Output accumulated squared" + " gradients."); + AddOutput("LinearAccumOut", + "(Tensor) Output accumulated linear" + " gradients."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddAttr("lr_power", + "(float, default -0.5f) " + "Learning Rate Power.") + .SetDefault(-0.5f); + AddComment(R"DOC( +FTRL (Follow The Regularized Leader) Operator. + +Optimizer that implements the FTRL algorithm: + +$$ +new\_accum = squared\_accum + grad^2 \\ +if (lr\_power == -0.5) { + linear\_accum += grad - (\surd(new\_accum) - \surd(squared\_accum)) / + (learning\_rate * param) \\ +} else { + linear\_accum += grad - + (new\_accum^{-lr\_power} - accum^{-lr\_power}) / + (learning\_rate * param) \\ +} + +x = (l1 * sign(linear\_accum) - linear\_accum) +if (lr\_power == -0.5) { + y = \frac{\surd(new\_accum)}{learning\_rate} + (2 * l2) \\ + pre\_shrink = \frac{x}{y} \\ + param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\ +} else { + y = \frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) \\ + pre\_shrink = \frac{x}{y} \\ + param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\ +} +squared\_accum += grad^2; +$$ + +The paper that proposed Follow The Regularized Leader (FTRL): +(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, 
ops::FTRLOpMaker); +REGISTER_OP_CPU_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/ftrl_op.cu b/paddle/fluid/operators/ftrl_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..dbdfcb927e0aff373a716c5e0eace96bec38e9ad --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/ftrl_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/ftrl_op.h b/paddle/fluid/operators/ftrl_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0a9405fcef1fa405ab14ba7c797b99c2259892f7 --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class FTRLOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* sq_accum_out = ctx.Output("SquaredAccumOut"); + auto* lin_accum_out = ctx.Output("LinearAccumOut"); + + param_out->mutable_data(ctx.GetPlace()); + sq_accum_out->mutable_data(ctx.GetPlace()); + lin_accum_out->mutable_data(ctx.GetPlace()); + + auto grad = ctx.Input("Grad"); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + auto lr_power = static_cast(ctx.Attr("lr_power")); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto sq_accum = + EigenVector::Flatten(*ctx.Input("SquaredAccumulator")); + auto lin_accum = + EigenVector::Flatten(*ctx.Input("LinearAccumulator")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto s_acc_out = EigenVector::Flatten(*sq_accum_out); + auto l_acc_out = EigenVector::Flatten(*lin_accum_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + auto new_accum = sq_accum + g * g; + // Special case for lr_power = -0.5 + if (lr_power == static_cast(-0.5)) { + l_acc_out.device(place) = + lin_accum + g - + ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p; + } else { + l_acc_out.device(place) = + lin_accum + g - + ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) / + lr.broadcast(grad_dsize)) * + p; + } + + auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out); + if (lr_power == static_cast(-0.5)) { + auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) + + 
l_acc_out.constant(static_cast(2) * l2); + auto pre_shrink = x / y; + p_out.device(place) = + (l_acc_out.abs() > l_acc_out.constant(l1)) + .select(pre_shrink, p.constant(static_cast(0))); + } else { + auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) + + l_acc_out.constant(static_cast(2) * l2); + auto pre_shrink = x / y; + p_out.device(place) = + (l_acc_out.abs() > l_acc_out.constant(l1)) + .select(pre_shrink, p.constant(static_cast(0))); + } + + s_acc_out.device(place) = sq_accum + g * g; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..af5898e29ecaed5a4d2cf8372a3bb20f192fc776 --- /dev/null +++ b/paddle/fluid/operators/gather.cu.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::DeviceContext; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output, + size_t index_size, size_t slice_size) { + CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + int gather_i = indices[indices_i]; + int params_i = gather_i * slice_size + slice_i; + *(output + i) = *(params + params_i); + } +} + +/** + * A thin wrapper on gpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + // PADDLE_ENFORCE(platform::is_gpu_place(place)); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + int block = 512; + int n = slice_size * index_size; + int grid = (n + block - 1) / block; + + GatherCUDAKernel<<< + grid, block, 0, + reinterpret_cast(ctx).stream()>>>( + p_src, p_index, p_output, index_size, slice_size); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h new file mode 100644 index 
0000000000000000000000000000000000000000..287732eeb6e5249f631bc3e39cd18bc050f9fc3b --- /dev/null +++ b/paddle/fluid/operators/gather.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +/** + * A thin wrapper for gathering on cpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int i = 0; i < index_size; ++i) { + int index_ = 
p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dceeb71ee3552bcb462014b5f08a59d4406497ad --- /dev/null +++ b/paddle/fluid/operators/gather_op.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gather_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class GatherOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of GatherOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Index"), + "Input(Index) of GatherOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of GatherOp should not be null."); + + auto index_dims = ctx->GetInputDim("Index"); + PADDLE_ENFORCE(index_dims.size() == 1); // Index must be 1-D + int batch_size = ctx->GetInputDim("Index")[0]; + PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >= 0"); + framework::DDim output_dims(ctx->GetInputDim("X")); + output_dims[0] = batch_size; // Out keeps X's shape except for dim 0 + ctx->SetOutputDim("Out", output_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class GatherGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class GatherOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The source input of gather op"); + AddInput("Index", "The index input of gather op"); + 
AddOutput("Out", "The output of gather op"); + AddComment(R"DOC( +Gather Operator. + +$Out = X[Index]$ + +Out is obtained by gathering entries of the outer-most dimension +of X indexed by Index and concatenating them together. + +Example: + +X = [[1, 2], + [3, 4], + [5, 6]] + +Index = [1, 2] + +Then: + +Out = [[3, 4], + [5, 6]] + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad, + ops::GatherGradOp); +REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel); +REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..484f4232624e862aff7c0aff337b4e5df65d5be3 --- /dev/null +++ b/paddle/fluid/operators/gather_op.cu @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gather.cu.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/gather_op.h" +#include "scatter.cu.h" + +namespace paddle { +namespace operators { + +template +class GatherOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + GPUGather(ctx.device_context(), *x, *index, output); + } +}; + +template +class GatherGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *Index = ctx.Input("Index"); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dO = ctx.Input(framework::GradVarName("Out")); + auto *x = ctx.Input("X"); + + dX->mutable_data(ctx.GetPlace()); + auto dxt = framework::EigenVector::Flatten(*dX); + auto &place = *ctx.template device_context() + .eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + + GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7ba4a31c81be025978c6c2a325792eea2eb353a7 --- /dev/null +++ b/paddle/fluid/operators/gather_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "gather.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class GatherOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + CPUGather(ctx.device_context(), *x, *index, output); + } +}; + +template +class GatherGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + + auto *Index = ctx.Input("Index"); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dO = ctx.Input(framework::GradVarName("Out")); + + dX->mutable_data(ctx.GetPlace()); + auto dxt = framework::EigenVector::Flatten(*dX); + auto &place = *ctx.template device_context() + .eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + + ScatterAssign(ctx.device_context(), *dO, *Index, dX); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_test.cc 
b/paddle/fluid/operators/gather_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d86cf5ce334705d16435f542cc33be454edabb7 --- /dev/null +++ b/paddle/fluid/operators/gather_test.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#include + +TEST(Gather, GatherData) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + + int* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({3, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({2}), CPUPlace()); + + for (int i = 0; i < 12; ++i) p_src[i] = i; // src rows: 0..3, 4..7, 8..11 + p_index[0] = 1; + p_index[1] = 0; + + int* p_output = output->mutable_data(make_ddim({2, 4}), CPUPlace()); + + paddle::platform::CPUPlace cpu_place; // automatic storage, no leak + paddle::platform::CPUDeviceContext ctx(cpu_place); + CPUGather(ctx, *src, *index, output); + + for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); // src row 1 + for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); // src row 0 + + delete src; + delete index; + delete output; +} diff --git a/paddle/fluid/operators/gaussian_random_op.cc 
b/paddle/fluid/operators/gaussian_random_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b090f8759765039eadfc900361bcdabe215c2225 --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + + unsigned int seed = static_cast(context.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::normal_distribution dist(mean, std); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } +}; + +class GaussianRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of GaussianRandomOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + std::vector temp; + temp.reserve(shape.size()); + for 
(auto dim : shape) { + temp.push_back(static_cast(dim)); + } + PADDLE_ENFORCE(shape.size() > 0UL, + "shape can be one int or array. shape must be set."); + ctx->SetOutputDim("Out", framework::make_ddim(temp)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "Output matrix of gaussian random op"); + + AddAttr>("shape", + "(vector) " + "The dimension of random tensor."); + AddAttr("mean", + "(float, default 0.0) " + "mean of random tensor.") + .SetDefault(.0f); + AddAttr("std", + "(float, default 1.0) " + "std of random tensor.") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed of generator." + "0 means use system wide seed.") + .SetDefault(0); + AddAttr("dtype", + "(int, default 5(FP32)) " + "Output data type.") + .SetDefault(framework::proto::DataType::FP32); + + AddComment(R"DOC( +GaussianRandom Operator. + +Used to initialize tensors with gaussian random generator. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, + ops::GaussianRandomOpMaker); +REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..70d655d4bb259bf33765fa42e46a19510ffca35d --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct GaussianGenerator { + T mean_, std_; + unsigned int seed_; + + __host__ __device__ GaussianGenerator(T mean, T std, unsigned int seed) + : mean_(mean), std_(std), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::normal_distribution dist(mean_, std_); + rng.discard(n); // advance the freshly seeded engine by n steps + return dist(rng); + } +}; + +template +class GPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T mean = static_cast(context.Attr("mean")); + T std = static_cast(context.Attr("std")); + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/get_places_op.cc 
b/paddle/fluid/operators/get_places_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba908e472bbc165a244d8543713f1dbf293abb48 --- /dev/null +++ b/paddle/fluid/operators/get_places_op.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif + +namespace paddle { +namespace operators { + +static size_t CUDADevCount() { +#ifdef PADDLE_WITH_CUDA + return platform::GetCUDADeviceCount(); +#else + return 0UL; +#endif +} + +class GetPlacesOp : public framework::OperatorBase { + public: + GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + bool is_gpu; + if (Attr("device_type") == "AUTO") { + is_gpu = platform::is_gpu_place(place); + } else { + is_gpu = Attr("device_type") == "CUDA"; + } + auto device_count = static_cast(Attr("device_count")); + if (device_count == 0) { + device_count = + is_gpu ? 
CUDADevCount() : std::thread::hardware_concurrency(); + } + PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count", + is_gpu ? "GPU" : "CPU"); + + auto out_var_name = Output("Out"); + auto &places = + *(detail::Ref(scope.FindVar(out_var_name), + "Output variable %s cannot be found", out_var_name) + .GetMutable()); + places.reserve(device_count); + if (is_gpu) { + PADDLE_ENFORCE_LE(device_count, CUDADevCount(), + "Only %d CUDA devices found, cannot set to %d", + CUDADevCount(), device_count); + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CUDAPlace(static_cast(i))); + } + } else { + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CPUPlace()); + } + } + } +}; + +class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "vector of Place"); + AddAttr("device_count", "device count").SetDefault(0); + AddAttr("device_type", "device type") + .InEnum({"CUDA", "CPU", "AUTO"}) + .SetDefault("AUTO"); + AddComment(R"DOC( +Returns a list of places based on flags. The list will be used for parallel +execution. 
+)DOC"); + } +}; + +class GetPlacesInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o_name : op_desc.Output("Out")) { + block->FindRecursiveOrCreateVar(o_name).SetType( + framework::proto::VarDesc::PLACE_LIST); + } + } +}; + +class GetPlacesInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + // Do nothing + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker, + ops::GetPlacesInferVarType, ops::GetPlacesInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1436e55b0e13b8b327a61e7c91294fa958b146c4 --- /dev/null +++ b/paddle/fluid/operators/gru_op.cc @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gru_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class GRUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(%s) of GRUOp should not be null.", "BatchGate"); + PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"), + "Output(%s) of GRUOp should not be null.", + "BatchResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"), + "Output(%s) of GRUOp should not be null.", "BatchHidden"); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(%s) of GRUOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto weight_dims = ctx->GetInputDim("Weight"); + int input_size = input_dims[1]; + int frame_size = weight_dims[0]; + PADDLE_ENFORCE_EQ(input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUOp."); + PADDLE_ENFORCE_EQ( + weight_dims[1], frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + } + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + ctx->SetOutputDim("BatchGate", input_dims); + ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size}); + 
ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", {input_dims[0], frame_size}); + ctx->ShareLoD("Input", "Hidden"); + } +}; + +class GRUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) The first input is a LoDTensor, which supports " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 3D), where, T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) The initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size.") + .AsDispensable(); + AddInput( + "Weight", + "(Tensor) The learnable hidden-hidden weight matrix with shape " + "(D x 3D), where D is the hidden size. The elements continuous in " + "memory can be divided into two parts. The first part are weights of " + "the update gate and reset gate with shape (D x 2D), and the second " + "part are weights of output candidate with shape (D x D)."); + AddInput("Bias", + "(Tensor, optional) Bias vector with shape (1 x 3D) concating " + "bias of the update gate, reset gate and output candidate.") + .AsDispensable(); + AddOutput("BatchGate", + "(LoDTensor) To compute with batches, sequence data will be " + "reorganized into several successive batches each containing " + "data from the same time step. The LoDTensor BatchGate contains " + "the update gate, reset gate and output candidate values " + "organized in batches. The LoD size is 2. The first LoD contains " + "the batch offsets and the second LoD contains the indexes in " + "the raw sequence data.") + .AsIntermediate(); + AddOutput( + "BatchResetHiddenPrev", + "(LoDTensor) The reset hidden state LoDTensor organized in batches. 
" + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`.") + .AsIntermediate(); + AddOutput( + "BatchHidden", + "(LoDTensor) The hidden state LoDTensor organized in batches. " + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`.") + .AsIntermediate(); + AddOutput( + "Hidden", + "(LoDTensor) the hidden state LoDTensor organized in sequences. " + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`."); + AddAttr("activation", + "(string, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault("tanh"); + AddAttr( + "gate_activation", + "(string, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault("sigmoid"); + AddAttr("is_reverse", + "(bool, default: False) " + "whether to compute reversed GRU.") + .SetDefault(false); + AddComment(R"DOC( +GRU Operator implements part calculations of the complete GRU as following: + +$$ +update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ +reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ +output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ +output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) +$$ + +@note To implement the complete GRU, fully-connected operator must be used +before to feed xu, xr and xc as the Input of GRU operator. 
+)DOC"); + } +}; + +class GRUGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUGradOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUGradOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(%s) of GRUGradOp should not be null.", "BatchGate"); + PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"), + "Input(%s) of GRUGradOp should not be null.", + "BatchResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("BatchHidden"), + "Input(%s) of GRUGradOp should not be null.", "BatchHidden"); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(%s) of GRUGradOp should not be null.", "Hidden"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), + "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto weight_dims = ctx->GetInputDim("Weight"); + int input_size = input_dims[1]; + int frame_size = weight_dims[0]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ(input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + auto h0_grad_name = framework::GradVarName("H0"); + if (ctx->HasOutput(h0_grad_name)) + ctx->SetOutputDim(h0_grad_name, h0_dims); + } + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; 
+ int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + auto input_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(input_grad_name)) + ctx->SetOutputDim(input_grad_name, input_dims); + auto weight_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(weight_grad_name)) + ctx->SetOutputDim(weight_grad_name, weight_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); +REGISTER_OP_CPU_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CPU_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e908d01d2920af8bdbbdc694944e62a86bad327a --- /dev/null +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gru_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h new file mode 100644 index 0000000000000000000000000000000000000000..37f3ae1a837c77bd5e3696abbd9ae14257a7f5d7 --- /dev/null +++ b/paddle/fluid/operators/gru_op.h @@ -0,0 +1,261 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index_lod, *dst, indexed_src); +} + +template +class GRUKernel : public framework::OpKernel { + public: + void BatchCompute(const framework::ExecutionContext& context) const { + auto* input = context.Input("Input"); + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); + const T* weight_data = weight->data(); + auto* bias = context.Input("Bias"); + auto* batch_gate = context.Output("BatchGate"); + batch_gate->mutable_data(context.GetPlace()); + auto* batch_reset_hidden_prev = + context.Output("BatchResetHiddenPrev"); + batch_reset_hidden_prev->mutable_data(context.GetPlace()); + auto* batch_hidden = context.Output("BatchHidden"); + batch_hidden->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + context.ShareLoD("Input", "Hidden"); + + auto hidden_dims = hidden->dims(); + + bool is_reverse = context.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); + to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); + + if (bias) { + math::RowwiseAdd add_bias; + add_bias(dev_ctx, *batch_gate, *bias, batch_gate); + } + + int frame_size = hidden_dims[1]; + 
math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + + framework::Vector order(batch_gate->lod()[2]); + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); + gru_value.prev_out_value = ordered_h0.data(); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + math::GRUUnitFunctor::compute( + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); + gru_value.prev_out_value = gru_value.output_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(dev_ctx, *batch_hidden, *hidden); + } + + void Compute(const framework::ExecutionContext& context) const override { + BatchCompute(context); + } +}; + +template +class GRUGradKernel : public framework::OpKernel { + public: + void BatchCompute(const framework::ExecutionContext& context) const { + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); 
+ const T* weight_data = weight->data(); + auto* batch_gate = context.Input("BatchGate"); + auto* batch_reset_hidden_prev = + context.Input("BatchResetHiddenPrev"); + auto* batch_hidden = context.Input("BatchHidden"); + auto* hidden = context.Input("Hidden"); + auto* hidden_grad = + context.Input(framework::GradVarName("Hidden")); + auto* input_grad = + context.Output(framework::GradVarName("Input")); + auto* h0_grad = context.Output(framework::GradVarName("H0")); + auto* weight_grad = + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = context.Output(framework::GradVarName("Bias")); + + auto gate_dims = batch_gate->dims(); + auto hidden_dims = hidden->dims(); + int frame_size = hidden_dims[1]; + + math::LoDTensor2BatchFunctor to_batch; + LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; + batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); + batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); + batch_reset_hidden_prev_grad.mutable_data(hidden_dims, + context.GetPlace()); + math::SetConstant zero; + auto& dev_ctx = context.template device_context(); + zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); + zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); + zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); + + Tensor ordered_h0, ordered_h0_grad; + + framework::Vector order(batch_gate->lod()[2]); + + if (h0) { + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, + true); + } + if (h0_grad) { + ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); + zero(context.template device_context(), &ordered_h0_grad, + static_cast(0.0)); + } + + bool is_reverse = context.Attr("is_reverse"); + batch_hidden_grad.set_lod(batch_hidden->lod()); + to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); + + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + + 
math::GRUMetaGrad gru_grad; + if (weight_grad) { + gru_grad.gate_weight_grad = + weight_grad->mutable_data(context.GetPlace()); + zero(dev_ctx, weight_grad, static_cast(0.0)); + gru_grad.state_weight_grad = + weight_grad->data() + 2 * frame_size * frame_size; + } else { + gru_grad.gate_weight_grad = nullptr; + gru_grad.state_weight_grad = nullptr; + } + + auto batch_starts = batch_hidden_grad.lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + gru_value.gate_value = gate_t.data(); + Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); + gru_grad.output_grad = hidden_grad_t.data(); + Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); + gru_grad.gate_grad = gate_grad_t.data(); + Tensor reset_hidden_prev_grad_t = + batch_reset_hidden_prev_grad.Slice(bstart, bend); + gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data(); + if (n == 0) { + gru_value.prev_out_value = h0 ? ordered_h0.data() : nullptr; + gru_grad.prev_out_grad = + h0 && h0_grad ? 
ordered_h0_grad.data() : nullptr; + } else { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); + gru_value.prev_out_value = hidden_prev_t.data(); + Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); + gru_grad.prev_out_grad = hidden_prev_grad_t.data(); + } + + math::GRUUnitGradFunctor::compute( + dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, + active_gate); + } + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + math::Batch2LoDTensorFunctor to_seq; + batch_gate_grad.set_lod(batch_gate->lod()); + to_seq(dev_ctx, batch_gate_grad, *input_grad); + } + if (bias_grad) { + bias_grad->mutable_data(context.GetPlace()); + math::ColwiseSum col_sum; + col_sum(dev_ctx, batch_gate_grad, bias_grad); + } + if (h0 && h0_grad) { + ReorderInitState(dev_ctx, ordered_h0_grad, order, + h0_grad, false); + } + } + + void Compute(const framework::ExecutionContext& context) const override { + BatchCompute(context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..21ad3aeb492ee18d465edea6ec0fca7e49d1366b --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gru_unit_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class GRUUnitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUUnitOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"), + "Input(%s) of GRUUnitOp should not be null.", "HiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUUnitOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasOutput("Gate"), + "Output(%s) of GRUUnitOp should not be null.", "Gate"); + PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"), + "Output(%s) of GRUUnitOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(%s) of GRUUnitOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev"); + auto weight_dims = ctx->GetInputDim("Weight"); + int batch_size = input_dims[0]; + int input_size = input_dims[1]; + int frame_size = hidden_prev_dims[1]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUUnitOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + 
ctx->SetOutputDim("Gate", {batch_size, frame_size * 3}); + ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size}); + ctx->SetOutputDim("Hidden", {batch_size, frame_size}); + } +}; + +class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " + "input."); + AddInput("HiddenPrev", + "(Tensor) Matrix with shape [batch_size, frame_size] for the " + "states of previous time step."); + AddInput( + "Weight", + "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. " + "The elements continuous in memory can be divided into two parts. " + "The first part are weights of the update gate and reset gate " + "with shape [frame_size, frame_size * 2], and the second part are " + "weights of output candidate with shape [frame_size, frame_size]."); + AddInput( + "Bias", + "(Tensor) Bias vector with shape [1, frame_size * 3] concatenating " + "bias of the update gate, reset gate and output candidate.") + .AsDispensable(); + AddOutput("Gate", + "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " + "output of update gate, reset gate and output candidate.") + .AsIntermediate(); + AddOutput("ResetHiddenPrev", + "(Tensor) Matrix with shape [batch_size, frame_size] for the " + "reseted hidden state of previous time step.") + .AsIntermediate(); + AddOutput("Hidden", + "(Tensor) The GRU hidden state of the current time step " + "with shape [batch_size, frame_size]."); + AddAttr("activation", + "(enum int, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault(tanh) + .InEnum({identity, sigmoid, tanh, relu}); + AddAttr("gate_activation", + "(enum int, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault(sigmoid) + .InEnum({identity, sigmoid, tanh, relu}); + AddComment(R"DOC( 
+GRUUnit Operator implements partial calculations of the GRU unit as following: + +$$ +update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ +reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ +output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ +output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) +$$ + +which is same as one time step of GRU Operator. + +@note To implement the complete GRU unit, fully-connected operator must be +used before to feed xu, xr and xc as the Input of GRUUnit operator. + +)DOC"); + } +}; + +class GRUUnitGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUUnitGradOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"), + "Input(%s) of GRUUnitGradOp should not be null.", + "HiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUUnitGradOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasInput("Gate"), + "Input(%s) of GRUUnitGradOp should not be null.", "Gate"); + PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"), + "Input(%s) of GRUUnitGradOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(%s) of GRUUnitGradOp should not be null.", "Hidden"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), + "Input(%s@GRAD) of GRUUnitGradOp should not be null.", + "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev"); + auto weight_dims = ctx->GetInputDim("Weight"); + // int batch_size = input_dims[0]; + int input_size = input_dims[1]; + int frame_size = hidden_prev_dims[1]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in 
GRUUnitOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + auto input_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(input_grad_name)) + ctx->SetOutputDim(input_grad_name, input_dims); + auto hidden_prev_grad_name = framework::GradVarName("HiddenPrev"); + if (ctx->HasOutput(hidden_prev_grad_name)) + ctx->SetOutputDim(hidden_prev_grad_name, hidden_prev_dims); + auto weight_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(weight_grad_name)) + ctx->SetOutputDim(weight_grad_name, weight_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, + ops::GRUUnitGradOp); +REGISTER_OP_CPU_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..88b707fd1314ec5f12b507edc64a56ea9895a9d6 --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/gru_unit_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c4031a5a575e59488c1a6cd77c3da88ea6af423e --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.h @@ -0,0 +1,244 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + +enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; + +template +class GRUUnitKernel : public framework::OpKernel { + public: + template + void ActCompute(const int act_type, const Device& d, X x, Y y) const { + if (act_type == identity) + y.device(d) = x; + else if (act_type == sigmoid) + SigmoidFunctor()(d, x, y); + else if (act_type == tanh) + TanhFunctor()(d, x, y); + else if (act_type == relu) + ReluFunctor()(d, x, y); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* bias = context.Input("Bias"); + auto* gate = context.Output("Gate"); + gate->mutable_data(context.GetPlace()); + auto* reset_hidden_prev = context.Output("ResetHiddenPrev"); + reset_hidden_prev->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + auto x = EigenMatrix::From(*input); + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto r_h_p = EigenMatrix::From(*reset_hidden_prev); + auto h = EigenMatrix::From(*hidden); + auto& place = + *context.template device_context().eigen_device(); + + // calculate unactivated gate outputs + if (bias) { + auto b = EigenMatrix::From(*bias); + g.device(place) = x + + b.reshape(Eigen::array({{1, frame_size * 3}})) + 
.broadcast(Eigen::array({{batch_size, 1}})); + } else { + g.device(place) = x; + } + const T* hidden_prev_data = hidden_prev->data(); + const T* weight_data = weight->data(); + T* gate_data = gate->data(); + T* reset_hidden_prev_data = reset_hidden_prev->data(); + math::gemm( + context.template device_context(), false, false, + batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, + weight_data, frame_size * 2, 1, gate_data, frame_size * 3); + + // calculate activited gate + Eigen::array extents({{batch_size, frame_size}}); + Eigen::array u_offsets({{0, 0}}); + ActCompute(context.Attr("gate_activation"), place, + g.slice(u_offsets, extents), g.slice(u_offsets, extents)); + auto u = g.slice(u_offsets, extents); // update gate + Eigen::array r_offsets({{0, frame_size}}); + ActCompute(context.Attr("gate_activation"), place, + g.slice(r_offsets, extents), g.slice(r_offsets, extents)); + auto r = g.slice(r_offsets, extents); // reset gate + r_h_p.device(place) = r * h_p; // reset previous hidden state + math::gemm( + context.template device_context(), false, false, + batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); + + Eigen::array c_offsets({{0, frame_size * 2}}); + ActCompute(context.Attr("activation"), place, + g.slice(c_offsets, extents), g.slice(c_offsets, extents)); + auto c = g.slice(c_offsets, extents); // output candidate + + // calculate final output + h.device(place) = u * (c - h_p) + h_p; + } +}; + +template +class GRUUnitGradKernel : public framework::OpKernel { + public: + template + void ActGradCompute(const int act_type, const Device& d, X x, Y y, DX dx, + DY dy) const { + // x is dummy and won't be used even in Relu(use y instead) + if (act_type == identity) + dx.device(d) = dy; + else if (act_type == sigmoid) + SigmoidGradFunctor()(d, x, y, dy, dx); + else if (act_type == tanh) + TanhGradFunctor()(d, x, 
y, dy, dx); + else if (act_type == relu) + ReluGradFunctor()(d, x, y, dy, dx); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* gate = context.Input("Gate"); + auto* reset_hidden_prev = context.Input("ResetHiddenPrev"); + auto* hidden_grad = context.Input(framework::GradVarName("Hidden")); + auto* input_grad = context.Output(framework::GradVarName("Input")); + auto* hidden_prev_grad = + context.Output(framework::GradVarName("HiddenPrev")); + auto* weight_grad = + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = context.Output(framework::GradVarName("Bias")); + Tensor gate_grad; + Tensor reset_hidden_prev_grad; + + const T* hidden_prev_data = hidden_prev->data(); + const T* weight_data = weight->data(); + T* gate_grad_data = + gate_grad.mutable_data(input->dims(), context.GetPlace()); + const T* reset_hidden_prev_data = reset_hidden_prev->data(); + T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data( + reset_hidden_prev->dims(), context.GetPlace()); + + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto d_h = EigenMatrix::From(*hidden_grad); + auto d_g = EigenMatrix::From(gate_grad); + auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); + auto& place = + *context.template device_context().eigen_device(); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + Eigen::array extents({{batch_size, frame_size}}); + Eigen::array u_offsets({{0, 0}}); + auto u = g.slice(u_offsets, extents); // update gate + Eigen::array r_offsets({{0, frame_size}}); + auto r = g.slice(r_offsets, extents); // reset gate + Eigen::array c_offsets({{0, frame_size * 2}}); + auto c = g.slice(c_offsets, extents); // output candidate + + // backward for unactivated 
update gate + ActGradCompute(context.Attr("gate_activation"), place, u, u, + d_g.slice(u_offsets, extents), d_h * (c - h_p)); + // backward for unactivated output candidate + ActGradCompute(context.Attr("activation"), place, c, c, + d_g.slice(c_offsets, extents), d_h * u); + // backward for reset_hidden_prev + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, + frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); + // backward for unactivated reset gate + ActGradCompute(context.Attr("gate_activation"), place, r, r, + d_g.slice(r_offsets, extents), d_r_h_p * h_p); + // backward for weight + if (weight_grad) { + T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); + // backward for state_weight + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, + frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, + weight_grad_data + frame_size * frame_size * 2, frame_size); + + // backward for update_gate_weight and reset_gate_weight + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, + frame_size * 2); + } + // backward for hidden_prev + if (hidden_prev_grad) { + T* hidden_prev_grad_data = + hidden_prev_grad->mutable_data(context.GetPlace()); + auto d_h_p = EigenMatrix::From(*hidden_prev_grad); + d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, + frame_size); + } + // backward for input + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + auto d_x = 
EigenMatrix::From(*input_grad); + d_x.device(place) = d_g; + } + // backward for bias + if (bias_grad) { + bias_grad->mutable_data(context.GetPlace()); + auto d_b = EigenVector::Flatten(*bias_grad); + d_b.device(place) = d_g.sum(Eigen::array({{0}})); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f644c22c9f1bdde6edc0126186361baccfbfcfb0 --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/hinge_loss_op.h" + +namespace paddle { +namespace operators { + +class HingeLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Logits) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Logits) contains a real value, " + "so the 2nd dimension of Input(Logits) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Logits", "Loss"); + } +}; + +template +class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "The input value (Logits) of Hinge loss op." + "Logits is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Hinge loss op." + "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the hinge loss."); + AddComment(R"DOC( +HingeLoss Operator. + +Let x be a logit (prediction) and y be the actual label. The logit can +take any values from (-inf, inf), but the labels should be either -1 or 1. +Then, the hinge loss is computed as follows: + +$$ +L_(x, y) = max(1 - y.x, 0) +$$ + +Note that the labels passed as input will have values as either 0 or 1. 
+ +)DOC"); + } +}; + +class HingeLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Input(Logits@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto lab_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Logits"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, + hinge_loss_grad, ops::HingeLossGradOp); +REGISTER_OP_CPU_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..cb53a9b7f4aaeeee71ed81c507feb0e9c946a541 --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/hinge_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1e924d236ea1d3208a8f425f76be8d455714a51f --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class HingeLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* loss = context.Output("Loss"); + auto& place = + *context.template device_context().eigen_device(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + loss->mutable_data(context.GetPlace()); + auto l = framework::EigenVector::Flatten(*loss); + l.device(place) = + (static_cast(1) - x * (static_cast(2) * y - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +class HingeLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* dloss = + context.Input(framework::GradVarName("Loss")); + auto* dpred = + context.Output(framework::GradVarName("Logits")); + auto& place = + *context.template device_context().eigen_device(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + auto dl = framework::EigenVector::Flatten(*dloss); + + if (dpred) { + dpred->mutable_data(context.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + auto alt_labels = static_cast(2) * y - static_cast(1); + dx.device(place) = + dl * ((x * alt_labels) < static_cast(1)).template cast() * + (-alt_labels); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dc1f609dcfa23dff82812e72c16b1d62a93ca9a6 --- /dev/null +++ 
b/paddle/fluid/operators/huber_loss_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/huber_loss_op.h" + +namespace paddle { +namespace operators { +/* Forward op: checks that X and Y have identical dims of the form [batch_size, 1]; Residual takes the dims of X and Out is [batch_size, 1] sharing the LoD of X. */ +class HuberLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must be initialized."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims, y_dims); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of Input(X) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(x_dims[1], 1, + "Each row of Input(X) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + + ctx->SetOutputDim("Residual", x_dims); + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->ShareLoD("X", "Out"); + } +}; +/* Declares the inputs, outputs, the delta attribute and the user-facing documentation of huber_loss. Adjacent string literals are concatenated, so each sentence ends with a space before the next literal. */ +template +class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input value of huber loss op. " + "X is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Y", + "The target value of huber loss op. "
+ "Y is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Residual", + "Intermediate tensor to cache residual value between Y and X. " + "The shape is same as Input(X) and will be reused in backward.") + .AsIntermediate(); + AddOutput("Out", + "The output tensor with shape [batch_size, 1] " + "which represents the huber loss."); + AddAttr("delta", "Hyper parameter in huber loss."); + AddComment(R"DOC( +HuberLoss Operator. + +Huber loss is a loss function used in robust regression. We define X as the +input value and Y as the target value. Huber loss can evaluate the fitness of +X to Y. Different from MSE loss, Huber loss is more robust for outliers. The +shape of X and Y are [batch_size, 1]. The equation is: + +$$ +Out_{\delta}(X, Y)_i = +\begin{cases} +0.5 * (Y_i - X_i)^2, +\quad |Y_i - X_i| \leq \delta \\ +\delta * (|Y_i - X_i| - 0.5 * \delta), +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_\delta(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y.
+ +)DOC"); + } +}; +/* Backward op: requires X, Y, Residual and Out@GRAD, checks that Residual and Out@GRAD have the dims of X, and sizes X@GRAD / Y@GRAD only when those outputs are present. */ +class HuberLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Residual"), + "Input(Residual) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto residual_dims = ctx->GetInputDim("Residual"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(residual_dims, x_dims); + PADDLE_ENFORCE_EQ(out_grad_dims, x_dims); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, + huber_loss_grad, ops::HuberLossGradOp); +REGISTER_OP_CPU_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CPU_KERNEL( + huber_loss_grad, + ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ef5120c69d4fda533625ace9bab504be39385ec9 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/huber_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CUDA_KERNEL( + huber_loss_grad, + ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..caca89fcf63d27c7717de522e38f6ff0cab0d8f6 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +/* Element-wise Huber loss of a residual val: 0.5 * val * val when |val| is at most delta, otherwise delta * (|val| - 0.5 * delta). */ +template +struct HuberLossForward { + HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return static_cast(0.5) * val * val; + } else { + return delta * (abs_val - static_cast(0.5) * delta); + } + } + + T delta; +}; +/* Forward kernel: stores Residual = Y - X, then Out = HuberLossForward(delta) applied to every residual. */ +template +class HuberLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("Residual"); + auto* out1 = context.Output("Out"); + auto delta = static_cast(context.Attr("delta")); + auto& place = + *context.template device_context().eigen_device(); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + out0->mutable_data(context.GetPlace()); + auto residual = EigenVector::Flatten(*out0); + residual.device(place) = y - x; + out1->mutable_data(context.GetPlace()); + auto loss = EigenVector::Flatten(*out1); + loss.device(place) = residual.unaryExpr(HuberLossForward(delta)); + } +}; +/* Derivative of the Huber loss with respect to the residual val, scaled by sign: sign * val when |val| is at most delta, otherwise sign * delta for positive val and -sign * delta for the rest. */ +template +struct HuberLossBackward { + HOSTDEVICE HuberLossBackward(const T& delta, T sign) + : sign(sign), delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return sign * val; + } else { + if (val > 0) { + return sign * delta; + } else { + return -sign * delta; + } + } + } + + T sign; + T delta; +}; +/* Backward kernel: X@GRAD = Out@GRAD * HuberLossBackward(delta, -1)(Residual) and Y@GRAD = Out@GRAD * HuberLossBackward(delta, 1)(Residual); each is computed only when its output is non-null. delta is read through context.Attr, the same accessor the forward kernel uses. */ +template +class HuberLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Residual"); +
auto* in1 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + auto* out1 = context.Output(framework::GradVarName("Y")); + auto delta = static_cast(context.Attr("delta")); + auto& place = + *context.template device_context().eigen_device(); + + auto residual = EigenVector::Flatten(*in0); + auto out_grad = EigenVector::Flatten(*in1); + + if (out0) { + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenVector::Flatten(*out0); + x_grad.device(place) = + out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + } + + if (out1) { + out1->mutable_data(context.GetPlace()); + auto y_grad = EigenVector::Flatten(*out1); + y_grad.device(place) = + out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..936e5fe49eda40dff6d8aa5fd626d443ee8dbe75 --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/im2sequence_op.h" + +namespace paddle { +namespace operators { +/* Forward op: X must be a 4-D tensor; Out is [batch * output_height * output_width, channels * kernel_height * kernel_width], with both output extents computed by OutputSize. */ +class Im2SequenceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Im2SequenceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Im2SequenceOp op should not be null."); + + auto in_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(in_dim.size(), 4, + "Input(X) format must be 4D tensor, eg., NCHW."); + + auto kernels = ctx->Attrs().Get>("kernels"); + auto strides = ctx->Attrs().Get>("strides"); + auto paddings = ctx->Attrs().Get>("paddings"); + + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); + + ctx->SetOutputDim("Out", {batch_size * output_height * output_width, + img_channels * kernels[0] * kernels[1]}); + } +}; +/* Declares the inputs, outputs, attributes and documentation of im2sequence. The size formulas in the DOC text mirror OutputSize, which is what InferShape and the kernels actually call. */ +class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor has NCHW format. "
+ "N: batch size, " + "C: channels, " + "H: height, " + "W: width"); + AddOutput("Out", "(LodTensor) The output data of im2sequence op."); + AddAttr>("kernels", + "(vector), the " + "kernels(kernel_height, kernel_width)"); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride)") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0, 0, 0}), the " + "paddings(up_pad, left_pad, down_pad, right_pad)") + .SetDefault({0, 0, 0, 0}); + AddComment(R"DOC( +This op uses kernels to scan images and converts these images to sequences. +After expanding, The number of time steps are output_height * output_width +and the dimension of each time step is kernel_height * kernel_width * channels, +in which (with integer division): + +output_height = + 1 + (padding_up + padding_down + img_height - kernel_height) / + stride_height; +output_width = + 1 + (padding_left + padding_right + img_width - kernel_width) / + stride_width; + +This op can be used after convolution neural network, and before recurrent neural network. + +Given: + +x = [[[[ 6. 2. 1.] + [ 8. 3. 5.] + [ 0. 2. 6.]] + + [[ 2. 4. 4.] + [ 6. 3. 0.] + [ 6. 4. 7.]]] + + [[[ 6. 7. 1.] + [ 5. 7. 9.] + [ 2. 4. 8.]] + + [[ 1. 2. 1.] + [ 1. 3. 5.] + [ 9. 0. 8.]]]] +x.dims = {2, 2, 3, 3} + +And: + +kernels = [2, 2] +strides = [1, 1] +paddings = [0, 0, 0, 0] + +Then: + +output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.] + [ 2. 1. 3. 5. 4. 4. 3. 0.] + [ 8. 3. 0. 2. 6. 3. 6. 4.] + [ 3. 5. 2. 6. 3. 0. 4. 7.] + [ 6. 7. 5. 7. 1. 2. 1. 3.] + [ 7. 1. 7. 9. 2. 1. 3. 5.] + [ 5. 7. 2. 4. 1. 3. 9. 0.] + [ 7. 9. 4. 8. 3. 5. 0.
8.]] +output.dims = {8, 8} +output.lod = [[0, 4, 8]] + +)DOC"); + } +}; +/* Backward op: requires X and Out@GRAD; X@GRAD takes the dims of X. */ +class Im2SequenceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, + im2sequence_grad, ops::Im2SequenceGradOp); +REGISTER_OP_CPU_KERNEL( + im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CPU_KERNEL( + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e7bf4631224620ad5c65f750ed0c0c22e936dcf --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/im2sequence_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CUDA_KERNEL( + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h new file mode 100644 index 0000000000000000000000000000000000000000..59456f0ea2996bb20bd48806a7258e31518a5ea3 --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +/* Number of sliding-window positions along one axis: (input_size + padding_0 + padding_1 - filter_size) / stride + 1, evaluated with int division. */ +inline int OutputSize(int input_size, int filter_size, int padding_0, + int padding_1, int stride) { + const int output_size = + (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + return output_size; +} +/* Forward kernel: X is indexed as (batch, channels, height, width). Out is temporarily viewed as [batch, numel / batch], each image is expanded into its slice by Im2ColFunctor with dilations {1, 1}, then the original dims are restored and a one-level LoD with output_height * output_width rows per image is set. */ +template +class Im2SequenceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* in = ctx.Input("X"); + LoDTensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + // TODO(wanghaoshuang): Add layout checker after 'set_layout' + // being available for python API + // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW, + // "Input(X) layout must be NCHW"); + auto in_dim = in->dims(); + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); + + const std::vector dilations({1, 1}); + + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); + + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); +
f(dev_ctx, src, dilations, strides, paddings, &dst); + } + out->Resize(out_dims); + + // set lod information + // TODO(wanghaoshuang): Move this to InferShape + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + for (int i = 0, offset = 0; i < batch_size + 1; ++i) { + lod[0].push_back(offset); + offset += output_height * output_width; + } + out->set_lod(lod); + } +}; +/* Backward kernel: zero-fills X@GRAD, then maps each image's slice of Out@GRAD back into it through Col2ImFunctor. Out@GRAD is an input but is resized in place via const_cast for slicing; its original dims are restored at the end. */ +template +class Im2SequenceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + Tensor* d_out = + const_cast(ctx.Input(framework::GradVarName("Out"))); + auto* d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + + auto x_v = framework::EigenVector::Flatten(*d_x); + auto& place = *ctx.template device_context().eigen_device(); + x_v.device(place) = x_v.constant(0.0); + + auto in_dim = in->dims(); + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); + + const std::vector dilations({1, 1}); + + auto d_out_dims = d_out->dims(); + d_out->Resize({batch_size, d_out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + Tensor dst = + d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + const Tensor src = d_out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); + math::Col2ImFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); + } + d_out->Resize(d_out_dims); + } +}; + +} // namespace operators +} // namespace paddle diff --git
a/paddle/operators/images/batch_norm_fork.dot b/paddle/fluid/operators/images/batch_norm_fork.dot similarity index 100% rename from paddle/operators/images/batch_norm_fork.dot rename to paddle/fluid/operators/images/batch_norm_fork.dot diff --git a/paddle/operators/images/batch_norm_fork.png b/paddle/fluid/operators/images/batch_norm_fork.png similarity index 100% rename from paddle/operators/images/batch_norm_fork.png rename to paddle/fluid/operators/images/batch_norm_fork.png diff --git a/paddle/operators/images/batch_norm_op_kernel.png b/paddle/fluid/operators/images/batch_norm_op_kernel.png similarity index 100% rename from paddle/operators/images/batch_norm_op_kernel.png rename to paddle/fluid/operators/images/batch_norm_op_kernel.png diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d488067b254c37515c6bdb9a4589aad311f344f --- /dev/null +++ b/paddle/fluid/operators/increment_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +/* Shape inference: requires input X and output Out, enforces that X holds exactly one element, and gives Out the dims of X. */ +class IncrementInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IncrementOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of IncrementOp should not be null."); + PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X"))); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; +/* Visitor passed to framework::VisitDataType: writes the first element of out as the first element of x plus value_, with value_ cast to the visited element type. Holds x by reference and out by pointer, so both must outlive the call. */ +struct IncrementFunctor { + IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out, + float value) + : x_(x), out_(out), value_(value) {} + + template + void operator()() const { + *out_->data() = *x_.data() + static_cast(value_); + } + + const framework::LoDTensor &x_; + framework::LoDTensor *out_; + float value_; +}; +/* Operator: enforces that X lives on a CPU place, resizes and allocates Out to match the dims, place and type of X, then applies IncrementFunctor with the step attribute. The variables returned by scope.FindVar are dereferenced without a null check. */ +class IncrementOp : public framework::OperatorBase { + public: + IncrementOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + + PADDLE_ENFORCE(platform::is_cpu_place(x.place())); + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + float value = Attr("step"); + VLOG(10) << Output("Out") << " increase " << Input("X") << " with " + << value; + framework::VisitDataType(framework::ToDataType(out.type()), + IncrementFunctor(x, &out, value)); + } +}; +/* Declares input X, output Out, the step attribute (default 1.0) and the documentation of increment. */ +class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input tensor of increment operator"); + AddOutput("Out", +
"(Tensor) The output tensor of increment operator."); + AddAttr("step", + "(float, default 1.0) " + "The step size by which the " + "input tensor will be incremented.") + .SetDefault(1.0); + AddComment(R"DOC( +Increment Operator. + +The equation is: +$$Out = X + step$$ + +)DOC"); + } +}; +/* Grad maker: the backward op is another increment whose X is the forward Out, whose Out is the forward X, and whose step is the negated forward step. */ +class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("increment"); + grad_op->SetInput("X", Output("Out")); + grad_op->SetOutput("Out", Input("X")); + grad_op->SetAttr("step", -boost::get(GetAttr("step"))); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape, + ops::IncrementOpMaker, ops::IncrementGradOpMaker); diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc new file mode 100755 index 0000000000000000000000000000000000000000..c2e452cdfaa71cae53c2bfe259bae7f80cd259d7 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/iou_similarity_op.h" + +namespace paddle { +namespace operators { +/* Shape inference: X must be [N, 4] and Y must be [M, 4]; Out is [N, M] and shares the LoD of X. */ +class IOUSimilarityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IOUSimilarityOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of IOUSimilarityOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2."); + PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2."); + PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]"); + + ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]})); + } +}; +/* Declares the inputs, output and documentation of iou_similarity. The Y description states the shape of Y as [M, 4], matching the check in IOUSimilarityOp::InferShape. */ +class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, default LoDTensor) " + "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "the shape of X is [N, 4]. [xmin, ymin] is the left top " + "coordinate of the box if the input is image feature map, they " + "are close to the origin of the coordinate system. " + "[xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddInput("Y", + "(Tensor, default Tensor) " + "Box list Y holds M boxes, each box is represented as " + "[xmin, ymin, xmax, ymax], the shape of Y is [M, 4].
" + "[xmin, ymin] is the left top coordinate of the box if the " + "input is image feature map, and [xmax, ymax] is the right " + "bottom coordinate of the box."); + + AddOutput("Out", + "(LoDTensor, the lod is same as input X) The output of " + "iou_similarity op, a tensor with shape [N, M] " + "representing pairwise iou scores."); + + AddComment(R"DOC( +IOU Similarity Operator. +Computes intersection-over-union (IOU) between two box lists. + Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, + boxes in 'Y' are shared by all instances of the batched inputs of X. + Given two boxes A and B, the calculation of IOU is as follows: + +$$ +IOU(A, B) = +\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)} +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp, + ops::IOUSimilarityOpMaker); + +REGISTER_OP_CPU_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/iou_similarity_op.cu new file mode 100755 index 0000000000000000000000000000000000000000..f8df1f4aa4c4894b59fe373a9e0cb697dfb96b62 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/iou_similarity_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/iou_similarity_op.h b/paddle/fluid/operators/iou_similarity_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2fb1b5f70703f9c88a532644d88a8b5df45404f0 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" +/* Intersection-over-union of two boxes given as (xmin, ymin, xmax, ymax). A negative overlap extent is clamped to zero, and a union area of exactly zero yields zero instead of dividing by zero. */ +template +inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2, + T ymin2, T xmax2, T ymax2) { + constexpr T zero = static_cast(0); + T area1 = (ymax1 - ymin1) * (xmax1 - xmin1); + T area2 = (ymax2 - ymin2) * (xmax2 - xmin2); + T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1; + T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1; + T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2; + T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2; + T inter_height = inter_ymax - inter_ymin; + T inter_width = inter_xmax - inter_xmin; + inter_height = inter_height > zero ? inter_height : zero; + inter_width = inter_width > zero ?
inter_width : zero; + T inter_area = inter_width * inter_height; + T union_area = area1 + area2 - inter_area; + T sim_score = union_area == zero ? zero : inter_area / union_area; + return sim_score; +} +/* Row functor: for box row_id of x_, writes its IOU against each of the cols_ boxes of y_ into z_[row_id * cols_ + i]. Every box occupies 4 consecutive values. */ +template +struct IOUSimilarityFunctor { + IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols) + : x_(x), y_(y), z_(z), cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + T x_min1 = x_[row_id * 4]; + T y_min1 = x_[row_id * 4 + 1]; + T x_max1 = x_[row_id * 4 + 2]; + T y_max1 = x_[row_id * 4 + 3]; + for (size_t i = 0; i < cols_; ++i) { + T x_min2 = y_[i * 4]; + T y_min2 = y_[i * 4 + 1]; + T x_max2 = y_[i * 4 + 2]; + T y_max2 = y_[i * 4 + 3]; + + T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2, + x_max2, y_max2); + + z_[row_id * cols_ + i] = sim; + } + } + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +namespace paddle { +namespace operators { +/* Kernel: builds an IOUSimilarityFunctor over the data of X, Y and Out, then runs it for each of the x_n rows of X through platform::ForRange. */ +template +class IOUSimilarityKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::LoDTensor* in_x = ctx.Input("X"); + const framework::Tensor* in_y = ctx.Input("Y"); + framework::LoDTensor* out = ctx.Output("Out"); + + int x_n = in_x->dims()[0]; + int y_n = in_y->dims()[0]; + IOUSimilarityFunctor functor(in_x->data(), in_y->data(), + out->mutable_data(ctx.GetPlace()), y_n); + + platform::ForRange for_range( + static_cast(ctx.device_context()), x_n); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea424018d66dac85d5a4ad75cbf5199064d52848 --- /dev/null +++ b/paddle/fluid/operators/is_empty_op.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +constexpr char kInput[] = "X"; +constexpr char kOutput[] = "Out"; + +class IsEmptyOp : public framework::OperatorBase { + public: + IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get input + auto *var = scope.FindVar(Input(kInput)); + PADDLE_ENFORCE_NOT_NULL(var); + auto &tensor = var->Get(); + // get output + auto *out = scope.FindVar(Output(kOutput)); + PADDLE_ENFORCE_NOT_NULL(out); + auto *out_tensor = out->GetMutable(); + + out_tensor->Resize({1}); + out_tensor->mutable_data(platform::CPUPlace())[0] = + framework::product(tensor.dims()) == 0; + } +}; + +class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInput, "(Tensor) Tensor which is to be checked."); + AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); + AddComment(R"DOC( +IsEmpty Operator which checks whether a tensor is empty. 
+ +It will just return product(tensor.dims()) == 0; + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp, + paddle::operators::IsEmptyOpProtoMaker); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..974ee404f8364ed66e9e213f857ea89993e1d6af --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/l1_norm_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class L1NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + ctx->SetOutputDim("Out", {1}); + } +}; + +class L1NormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +class L1NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of l1_norm op."); + AddOutput("Out", "(Scalar) The output of l1_norm op."); + AddComment(R"DOC( +L1 Norm Operator. + +Computes the L1 norm of a tensor. 
+ +$$Out = \sum{|X|}$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, + ops::L1NormGradOp); +REGISTER_OP_CPU_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e9e864a346298a670db63c664491c336a9bd36a --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/l1_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7ddf2ac6a9046d4d8c2130b459f2385ef4e1301a --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = sum(abs(X)) +template +class L1NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto out = framework::EigenScalar::From(*Out); + auto &place = + *context.template device_context().eigen_device(); + + out.device(place) = x.abs().sum(); + } +}; + +// dX = dout * sign(X) +template +class L1NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *x = context.Input("X"); + const framework::Tensor *d_out = + context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(d_out->numel() == 1, "L1 Norm Gradient should be scalar"); + framework::Tensor *dx = + context.Output(framework::GradVarName("X")); + dx->mutable_data(context.GetPlace()); + + auto x_eigen = framework::EigenVector::Flatten(*x); + auto d_out_eigen = framework::EigenVector::Flatten(*d_out); + auto dx_eigen = framework::EigenVector::Flatten(*dx); + auto &place = + *context.template device_context().eigen_device(); + + Eigen::DSizes x_dsize(x->numel()); + dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); + } +}; + +} // namespace operators +} // namespace 
paddle diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c018965beefb362ae845d132e34ded1bb2911629 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/label_smooth_op.h" + +namespace paddle { +namespace operators { + +class LabelSmoothOp : public framework::OperatorWithKernel { + public: + LabelSmoothOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LabelSmoothOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LabelSmoothOp should not be null."); + auto in_dims = ctx->GetInputDim("X"); + if (ctx->HasInput("PriorDist")) { + auto noise_dims = ctx->GetInputDim("PriorDist"); + auto noise_numel = paddle::framework::product(noise_dims); + PADDLE_ENFORCE( + in_dims[1] == noise_numel, + "The number of elements in Input(PriorDist) must be equal to the " + "dimension of each label."); + } + ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", in_dims); + } +}; + +class 
LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) The input labels of LabelSmooth operator. This " + "input can be batched labels in one-hot encoding or output from " + "softmax, with shape [N x K], where N is the batch size and K is " + "the number of classes"); + AddInput("PriorDist", + "(Tensor, optional) " + "The prior distribution to be added to the smoothed label. It is " + "fixed during training and the number of elements should be equal " + "to the dimension K of each label. Default is uniform " + "distribution and each element will be set to 1/K if not provided " + "in input.") + .AsDispensable(); + AddOutput("Out", + "(LoDTensor) The smoothed label of LabelSmooth operator. It has " + "the same shape and LoD as Input(X)."); + AddAttr("epsilon", + "(float, default 0.0f) " + "The smoothing parameter of LabelSmooth operator.") + .SetDefault(0.0f); + AddComment(R"DOC( +LabelSmooth Operator. + +Label smoothing is a mechanism to regularize the classifier layer. In machine +learning, optimizing the log-likelihood of the correct label directly may +cause two problems. First, it may result in overfitting: if the model learns +to assign full probability to the ground-truth label for each training example, +it is not guaranteed to generalize. Second, it encourages the differences +between the largest logit and all others to become large, reducing the ability +of the model to adapt. Label smoothing is proposed to encourage the model to +be less confident, which replaces the ground-truth label $y$ with the weighted +sum of itself and some fixed distribution $\mu$, i.e. + +$$ + \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu, +$$ + +where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and +$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for +$\mu$. 
This change in the ground-truth label is called label-smoothing +regularization or LSR. + +See more details about label smoothing in https://arxiv.org/abs/1512.00567. + +)DOC"); + } +}; + +class LabelSmoothGradOp : public framework::OperatorWithKernel { + public: + LabelSmoothGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, + label_smooth_grad, ops::LabelSmoothGradOp); +REGISTER_OP_CPU_KERNEL( + label_smooth, + ops::LabelSmoothKernel, + ops::LabelSmoothKernel); +REGISTER_OP_CPU_KERNEL( + label_smooth_grad, + ops::LabelSmoothGradKernel, + ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4a40a4e9ec82199afae3ae77bb2296a2fa95b0a5 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/label_smooth_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + label_smooth, + ops::LabelSmoothKernel, + ops::LabelSmoothKernel); +REGISTER_OP_CUDA_KERNEL( + label_smooth_grad, + ops::LabelSmoothGradKernel, + ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15752377f663fcb526b8306158e1e90d743c6cb6 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LabelSmoothKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_t = ctx.Output("Out"); + auto* in_t = ctx.Input("X"); + auto* dist_t = ctx.Input("PriorDist"); + auto label_dim = in_t->dims()[1]; + out_t->mutable_data(ctx.GetPlace()); + + auto epsilon = ctx.Attr("epsilon"); + auto out = framework::EigenVector::Flatten(*out_t); + auto in = framework::EigenVector::Flatten(*in_t); + auto& dev = *ctx.template device_context().eigen_device(); + if (dist_t) { + auto dist = framework::EigenVector::Flatten(*dist_t); + out.device(dev) = + static_cast(1 - epsilon) * in + + epsilon * dist.broadcast(Eigen::DSizes(in_t->numel() / label_dim)); + } else { + out.device(dev) = static_cast(1 - epsilon) * in + + static_cast(epsilon / label_dim); + } + } +}; + +template +class LabelSmoothGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* d_in_t = ctx.Output(framework::GradVarName("X")); + d_in_t->mutable_data(ctx.GetPlace()); + + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto d_in = framework::EigenVector::Flatten(*d_in_t); + + auto epsilon = ctx.Attr("epsilon"); + auto& dev = *ctx.template device_context().eigen_device(); + d_in.device(dev) = static_cast(1 - epsilon) * d_out; + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..60e37ed01b3cad428dc0184634b4d36c9f24f9c5 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/layer_norm_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +class LayerNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mean"), + "Output(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Variance"), + "Output(Variance) of LayerNormOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); + PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(), + "'begin_norm_axis' must be less than the rank of X."); + + auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + if (ctx->HasInput("Scale")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); + } + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); 
+ } + + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + ctx->SetOutputDim("Mean", {left}); + ctx->SetOutputDim("Variance", {left}); + ctx->ShareLoD("X", "Y"); + } +}; + +class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The input tensor."); + AddInput("Scale", + "(Tensor, optional) Scale is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H]). " + "It is applied to the output.") + .AsDispensable(); + AddInput("Bias", + "(Tensor, optional) Bias is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H]). " + "It is applied to the output.") + .AsDispensable(); + AddOutput("Y", "(LoDTensor) Result after normalization."); + AddOutput("Mean", "(Tensor) Mean of the current mini batch.") + .AsIntermediate(); + AddOutput("Variance", "(Tensor) Variance of the current mini batch.") + .AsIntermediate(); + + AddAttr("epsilon", + "(float, default 1e-5) Constant for " + "numerical stability") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("begin_norm_axis", + "(int default:1), the " + "axis of `begin_norm_axis ... Rank(X) - 1` will be " + "normalized. `begin_norm_axis` splits the tensor(`X`) to a " + "matrix [N,H].") + .SetDefault(1) + .AddCustomChecker([](const int &begin_norm_axis) { + PADDLE_ENFORCE_GT(begin_norm_axis, 0, + "'begin_norm_axis' should be greater than zero."); + }); + + AddComment(R"DOC( +Layer Normalization. +Layer Norm has been implemented as discussed in the paper: +https://arxiv.org/abs/1607.06450 +... 
+)DOC"); + } +}; + +class LayerNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) of LayerNormOp should not be null."); + + // check output + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), + ctx->GetInputDim("Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, + layer_norm_grad, ops::LayerNormGradOp); +REGISTER_OP_CPU_KERNEL( + layer_norm, ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CPU_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff 
--git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..aa54fd54155ce19298ad9f80c930ad08e542d71c --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/layer_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..60c0b07add172520dc8062d3d5e8e4e69758e1f1 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.h @@ -0,0 +1,238 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +class LayerNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto x = *ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x.dims(); + + y->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = 
static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + x.Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto &dev_ctx = ctx.template device_context(); + math::RowwiseMean row_mean; + + // get mean + row_mean(dev_ctx, x, mean); + + // get variance + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); + row_mean(dev_ctx, out, var); + + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &out); + + if (scale) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); + } + if (bias) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); + } + } +}; + +template +class LayerNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto x = *ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto d_y = *ctx.Input(framework::GradVarName("Y")); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto &x_dims = x.dims(); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + auto &dev_ctx = ctx.template device_context(); + 
math::ColwiseSum colwise_sum; + + Tensor temp; + Tensor temp_norm; + if (d_scale || d_x) { + x.Resize(matrix_shape); + temp.mutable_data(matrix_shape, ctx.GetPlace()); + + if (!(bias || scale)) { + temp_norm.ShareDataWith(*y); + temp_norm.Resize(matrix_shape); + } else { + temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); + // Y carries Scale and/or Bias, so rebuild x_norm from X + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); + } + } + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + framework::DDim vec_shape({left}); + d_x->mutable_data(ctx.GetPlace()); + auto dx_dim = d_x->dims(); + Tensor temp_vec; + temp_vec.mutable_data(vec_shape, ctx.GetPlace()); + + math::RowwiseMean row_mean; + + if (scale) { + // dy_dx (Scale was applied in the forward pass) + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); + framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } else { + // dy_dx + framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + 
ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); + + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), d_x); + d_x->Resize(dx_dim); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e1dfa494872b6f187ee0b3cca399308a1cab42a --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -0,0 +1,269 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/linear_chain_crf_op.h" + +namespace paddle { +namespace operators { + +class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Emission", + "(LoDTensor, default LoDTensor) " + "A 2-D LoDTensor with shape [N x D], where N is the size of the " + "mini-batch and D is the total tag number. The unscaled emission " + "weight matrix for the linear chain CRF. "); + AddInput("Transition", + "(Tensor, default Tensor) A 2-D Tensor with shape " + "[(D + 2) x D]. 
The learnable parameter for the linear_chain_crf " + "operator. See more details in the operator's comments."); + AddInput("Label", + "(LoDTensor, default LoDTensor) A LoDTensor with shape " + "[N x 1], where N is the total element number in a mini-batch. " + "The ground truth."); + AddOutput( + "Alpha", + "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " + "The forward vectors for the entire batch. Denote it as $\alpha$. " + "$\alpha$ is a memo table used to calculate the normalization " + "factor in CRF. $\alpha[k, v]$ stores the unnormalized " + "probabilites of all possible unfinished sequences of tags that end at " + "position $k$ with tag $v$. For each $k$, " + "$\alpha[k, v]$ is a vector of length $D$ with a component for " + "each tag value $v$. This vector is called a forward vecotr and " + "will also be used in backward computations.") + .AsIntermediate(); + AddOutput( + "EmissionExps", + "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " + "The exponentials of Input(Emission). This is an intermediate " + "computational result in forward computation, and will be reused in " + "backward computation.") + .AsIntermediate(); + AddOutput( + "TransitionExps", + "(Tensor, default Tensor) A 2-D Tensor with shape " + "[(D + 2) x D]. The exponentials of Input(Transition). This is an " + "intermediate computational result in forward computation, and " + "will be reused in backward computation.") + .AsIntermediate(); + AddOutput( + "LogLikelihood", + "(Tensor, default Tensor) The logarithm of the conditional " + "likelihood of each training sample in a mini-batch. This is a 2-D " + "tensor with shape [S x 1], where S is the sequence number in a " + "mini-batch. Note: S is equal to the sequence number in a mini-batch. " + "The output is no longer a LoDTensor."); + AddComment(R"DOC( +LinearChainCRF Operator. 
+ +Conditional Random Field defines an undirected probabilistic graph with nodes +denoting random variables and edges denoting dependencies between these +variables. CRF learns the conditional probability $P(Y|X)$, where +$X = (x_1, x_2, ... , x_n)$ are structured inputs and +$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs. + +Linear chain CRF is a special case of CRF that is useful for sequence labeling +task. Sequence labeling tasks do not assume a lot of conditional +independences among inputs. The only constraint they impose is that the input +and output must be linear sequences. Thus, the graph of such a CRF is a simple +chain or a line, which results in the linear chain CRF. + +This operator implements the Forward-Backward algorithm for the linear chain +CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and +http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. + +Equation: +1. Denote Input(Emission) to this operator as $x$ here. +2. The first D values of Input(Transition) to this operator are for starting +weights, denoted as $a$ here. +3. The next D values of Input(Transition) of this operator are for ending +weights, denoted as $b$ here. +4. The remaning values of Input(Transition) are for transition weights, +denoted as $w$ here. +5. Denote Input(Label) as $s$ here. + +The probability of a sequence $s$ of length $L$ is defined as: +$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} + + \sum_{l=1}^L x_{s_l} + + \sum_{l=2}^L w_{s_{l-1},s_l})$$ + +where $Z$ is a normalization value so that the sum of $P(s)$ over +all possible sequences is 1, and $x$ is the emission feature weight +to the linear chain CRF. + +Finally, the linear chain CRF operator outputs the logarithm of the conditional +likelihood of each training sample in a mini-batch. + +NOTE: +1. The feature function for a CRF is made up of the emission features and the +transition features. The emission feature weights are NOT computed in +this operator. 
They MUST be computed first before this operator is called. + +2. Because this operator performs global normalization over all possible +sequences internally, it expects UNSCALED emission feature weights. +Please do not call this op with the emission feature being output of any +nonlinear activation. + +3. The 2nd dimension of Input(Emission) MUST be equal to the tag number. + +)DOC"); + } +}; + +class LinearChainCRFOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Emission"), + "Input(Emission) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Transition"), + "Input(Transition) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Alpha"), + "Output(Alpha) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("EmissionExps"), + "Output(EmissionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("TransitionExps"), + "Output(TransitionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("LogLikelihood"), + "Output(LogLikelihood) should be not null."); + + auto emission_dims = ctx->GetInputDim("Emission"); + PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, + "The Input(Emission) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); + + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, + "The Input(Transition) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_dims[0] - 2, transition_dims[1], + "An invalid dimension for the Input(Transition), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + PADDLE_ENFORCE_EQ( + emission_dims[1], transition_dims[1], + "The 2nd dimension of the Input(Emission) and the Input(Transition) " + "should be equal to the tag number."); + + auto label_dims 
= ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); + + ctx->SetOutputDim("Alpha", emission_dims); + ctx->SetOutputDim("EmissionExps", emission_dims); + ctx->SetOutputDim("TransitionExps", transition_dims); + // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood) + // is the sequence number in a mini-batch. The dimension set here should be + // resized to its correct size in the function Compute. Fix this once we can + // get LoD information in the InferShape interface. + ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1}); + } + + protected: + // Explicitly set that the data type of computation kernel of linear_chain_crf + // is determined by its input "Emission". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Emission")->type()), + platform::CPUPlace()); + } +}; + +class LinearChainCRFGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("EmissionExps"), + "Input(EmissionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("TransitionExps"), + "Input(TransitionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("LogLikelihood")), + "Input(LogLikelihood@GRAD) shoudl be not null."); + + auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); + PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL, + "The Input(EmissionExps) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_exps_dims[0], + "An empty mini-batch is not allowed."); + + 
auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); + PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL, + "The Input(TransitionExps) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_exps_dims[0] - 2, transition_exps_dims[1], + "An invalid dimension for the Input(TransitionExps), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + PADDLE_ENFORCE_EQ( + emission_exps_dims[1], transition_exps_dims[1], + "The 2nd dimension of the Input(EmissionExps) and the " + "Input(TransitionExps) should be equal to the tag number."); + + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_exps_dims[0], label_dims[0], + "The height of Input(EmissionExps) and the height of Input(Label) " + "should be the same."); + + if (ctx->HasOutput(framework::GradVarName("Emission"))) { + ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); + } + if (ctx->HasOutput(framework::GradVarName("Transition"))) { + ctx->SetOutputDim(framework::GradVarName("Transition"), + transition_exps_dims); + } + } + + protected: + // Explicitly set that the data type of output of the linear_chain_crf_grad + // operator is determined by its input: gradients of LogLikelihood. 
+ framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("LogLikelihood")) + ->type()), + platform::CPUPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, + linear_chain_crf_grad, ops::LinearChainCRFGradOp); +REGISTER_OP_CPU_KERNEL( + linear_chain_crf, + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CPU_KERNEL( + linear_chain_crf_grad, + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cu b/paddle/fluid/operators/linear_chain_crf_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e04e76eebc71def814fed65a469ef5f9f1b16b0 --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/linear_chain_crf_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + linear_chain_crf, + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CUDA_KERNEL( + linear_chain_crf_grad, + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15b64c09bf366b356683c47c82a6dbf9529d9b58 --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -0,0 +1,353 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +static inline T NormalizeL1(T* x, size_t len) { + T sum = 0.; + for (size_t i = 0; i < len; ++i) sum += x[i]; + // (This comment is from the old LinearChainCRFLayer.) + // Right now, we just bet that sum won't be zero. If this really happens, we + // will figure out what should be done then. + PADDLE_ENFORCE(sum, + "The unnormalized probabilities of all possible unfinished " + "sequences must be greater than 0."); + T s = 1. 
/ sum; + for (size_t i = 0; i < len; ++i) x[i] *= s; + return sum; +} + +template +struct ScalarMul { + explicit ScalarMul(const T& scalar) : scalar(scalar) {} + T operator()(const T& val) const { return val * scalar; } + + T scalar; +}; + +using framework::LoDTensor; +using framework::LoD; +using framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class LinearChainCRFOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(caoying) The checks related to LoD information should be + // moved into InferShape once after the InferShape is refactored. + PADDLE_ENFORCE_EQ(ctx.Input("Emission")->NumLevels(), 1UL, + "The Input(Emission) should be a sequence."); + PADDLE_ENFORCE_EQ(ctx.Input("Label")->NumLevels(), 1UL, + "The Input(Label) should be a sequence."); + auto in_lod = ctx.Input("Label")->lod(); + PADDLE_ENFORCE(in_lod.size(), "Input(Label) must be a sequence."); + const size_t level = 0; + const size_t seq_num = in_lod[level].size() - 1; + + const LoDTensor* emission_weights = ctx.Input("Emission"); + const Tensor* transition_weights = ctx.Input("Transition"); + const LoDTensor* label = ctx.Input("Label"); + + Tensor* emission_exps = ctx.Output("EmissionExps"); + Tensor* transition_exps = ctx.Output("TransitionExps"); + Tensor* alpha = ctx.Output("Alpha"); + Tensor* ll = ctx.Output("LogLikelihood"); + + // Because the computation codes only runs on CPU, here the memory for all + // the outputs is FIXED to be allocated on the CPU memory. + emission_exps->mutable_data(platform::CPUPlace()); + transition_exps->mutable_data(platform::CPUPlace()); + alpha->mutable_data(platform::CPUPlace()); + + // Resize the output tensor to its correct dimension. + ll->Resize({static_cast(seq_num), 1}); + ll->mutable_data(platform::CPUPlace()); + + // Now, all the inputs and outputs should be on the CPU memory. 
+ auto emission_dims = emission_weights->dims(); + const size_t batch_size = emission_dims[0]; + const size_t tag_num = emission_dims[1]; + + Tensor emission_row_max; + emission_row_max.mutable_data( + framework::make_ddim({static_cast(batch_size), 1}), + platform::CPUPlace()); + + auto& place = *ctx.template device_context() + .eigen_device(); + auto x = EigenMatrix::From(*emission_weights); + auto x_row_max = EigenMatrix::From(emission_row_max); + x_row_max.device(place) = + x.maximum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(int(batch_size), 1)); + + auto x_exps = EigenMatrix::From(*emission_exps); + x_exps.device(place) = + (x - x_row_max.broadcast(Eigen::DSizes(1, tag_num))).exp(); + + auto w = EigenMatrix::From(*transition_weights); + auto w_exps = EigenMatrix::From(*transition_exps); + w_exps.device(place) = w.exp(); + + T* log_likelihood = ll->data(); + for (size_t i = 0; i < seq_num; ++i) { + int start_pos = static_cast(in_lod[level][i]); + int end_pos = static_cast(in_lod[level][i + 1]); + if (end_pos == start_pos) { + // If an empty input sequence is given, pad 0 for its cost. 
+ log_likelihood[i] = 0.; + continue; + } + + const Tensor one_seq = emission_weights->Slice(start_pos, end_pos); + Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos); + Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos); + const Tensor one_seq_label = label->Slice(start_pos, end_pos); + Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); + + log_likelihood[i] = ForwardOneSequence( + one_seq, one_seq_row_max, one_seq_exps, *transition_weights, + *transition_exps, one_seq_label, &one_seq_alpha); + } + }; + + private: + T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max, + const Tensor& emission_exps, const Tensor& trans_weights, + const Tensor& trans_weight_exps, const Tensor& label, + Tensor* alpha) const { + const T* x = emission.data(); + const T* x_row_max = emission_row_max.data(); + const T* x_exps = emission_exps.data(); + const T* w = trans_weights.data(); + const T* w_exps = trans_weight_exps.data(); + T* alpha_value = alpha->data(); + + auto x_dims = emission.dims(); + const size_t seq_length = x_dims[0]; + const size_t tag_num = x_dims[1]; + // The 1st row of w are transition weights for start mask. + // The 2nd row of w are transition weights for end mask. + // Transition weights between other tags begin from the 3rd row of w. + const size_t state_trans_base_idx = 2; + + for (size_t i = 0; i < tag_num; ++i) { + alpha_value[i] = w_exps[i] * x_exps[i]; + } + T ll = -x_row_max[0] - std::log(NormalizeL1(alpha_value, tag_num)); + + for (size_t k = 1; k < seq_length; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T sum = 0.; + for (size_t j = 0; j < tag_num; ++j) { + sum += alpha_value[(k - 1) * tag_num + j] * // (*) + w_exps[(j + state_trans_base_idx) * tag_num + i]; + } + alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum; + } + // NormalizeL1 is to avoid underflow or overflow at (*). 
+ ll -= x_row_max[k] + + std::log(NormalizeL1(alpha_value + k * tag_num, tag_num)); + } + T sum = 0.; + for (size_t i = 0; i < tag_num; ++i) { + sum += alpha_value[(seq_length - 1) * tag_num + i] * w_exps[tag_num + i]; + } + ll -= std::log(sum); + // Now ll is equal to -log(Z). + + const int64_t* lbl = label.data(); + PADDLE_ENFORCE_LT( + static_cast(*std::max_element(lbl, lbl + seq_length)), tag_num, + "An invalid tag label that execesses the largest tag number."); + + // Calculate the nominator part, which depends on the label sequence. + ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] + + w[tag_num + lbl[seq_length - 1]] /*end transition*/; + for (size_t k = 1; k < seq_length; ++k) { + ll += x[k * tag_num + lbl[k]] + + w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]]; + } + return -ll; + } +}; + +template +class LinearChainCRFGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const size_t level = 0; // currently, only support sequence. + auto lod = ctx.Input("Label")->lod(); + PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence."); + + const Tensor* label = ctx.Input("Label"); + const Tensor* emission_exps = ctx.Input("EmissionExps"); + const Tensor* transition_exps = ctx.Input("TransitionExps"); + const Tensor* alpha = ctx.Input("Alpha"); + const T* ll_grad = + ctx.Input(framework::GradVarName("LogLikelihood"))->data(); + + Tensor* emission_grad = + ctx.Output(framework::GradVarName("Emission")); + Tensor* transition_grad = + ctx.Output(framework::GradVarName("Transition")); + + // TODO(caoying) Fix this constraint. When the Input(Emission) is from the + // data reader operator, it can have no gradients. 
+ PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null."); + emission_grad->mutable_data(platform::CPUPlace()); + if (transition_grad) { + transition_grad->mutable_data(platform::CPUPlace()); + math::set_constant(ctx.device_context(), transition_grad, 0.); + } + // Now, all the inputs and outputs should be on the CPU memory. + + auto emission_dims = emission_exps->dims(); + // Beta is the memo table used in dynamic programming to calculate the + // backwark vectors. For a backward vector i (the i-th row of beta), it + // captures the unnormalized probabilities of partial sequences starting + // at position i. + Tensor beta; + beta.mutable_data(emission_dims, platform::CPUPlace()); + + for (size_t i = 0; i < lod[level].size() - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + if (end_pos == start_pos) continue; + + const Tensor one_seq_emission_exps = + emission_exps->Slice(start_pos, end_pos); + const Tensor one_seq_label = label->Slice(start_pos, end_pos); + const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); + Tensor one_seq_beta = beta.Slice(start_pos, end_pos); + Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); + + BackwardOneSequence( + ctx.template device_context(), ll_grad[i], + one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, + &one_seq_beta, transition_grad, &one_seq_emission_grad); + } + }; + + private: + void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + const T ll_grad, const Tensor& emission_exps, + const Tensor& transition_exps, const Tensor& alpha, + const Tensor& label, Tensor* beta, + Tensor* transition_grad, + Tensor* emission_grad) const { + const T* w_exps = transition_exps.data(); + const T* x_exps = emission_exps.data(); + const int64_t* label_value = label.data(); + T* beta_value = beta->data(); + + auto x_dims = emission_exps.dims(); + const size_t seq_length = x_dims[0]; + const size_t tag_num = 
x_dims[1]; + const size_t state_trans_base_idx = 2; + + // Calculate the backward vectors: beta. + // First, calculate the initialition state. + for (size_t i = 0; i < tag_num; ++i) { + beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i]; + } + NormalizeL1(beta_value + (seq_length - 1) * tag_num, tag_num); + for (int k = static_cast(seq_length) - 2; k >= 0; --k) { + for (size_t i = 0; i < tag_num; ++i) { + T sum = 0.; + for (size_t j = 0; j < tag_num; ++j) { + sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) + x_exps[(k + 1) * tag_num + j] * + beta_value[(k + 1) * tag_num + j]; + } + beta_value[k * tag_num + i] = sum; + } + // NormalizeL1 is to avoid underflow or overflow at (**). + NormalizeL1(beta_value + k * tag_num, tag_num); + } + + auto x_grad_mat = EigenMatrix::From(*emission_grad); + auto alpha_mat = EigenMatrix::From(alpha); + auto beta_mat = EigenMatrix::From(*beta); + + auto* place = ctx.eigen_device(); + auto prob = alpha_mat * beta_mat; + auto row_sum = prob.sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(seq_length, 1)) + .broadcast(Eigen::DSizes(1, tag_num)); + x_grad_mat.device(*place) = + (prob / row_sum).unaryExpr(ScalarMul(ll_grad)); + + for (size_t k = 0; k < seq_length; ++k) { + x_grad_mat(k, label_value[k]) -= static_cast(ll_grad); + } + + if (transition_grad) { + T* trans_grad = transition_grad->data(); + for (size_t k = 0; k < tag_num; ++k) { + // Do not multiply by the output gradient here, because x_grad_mat has + // alrealy done this. + trans_grad[k] += x_grad_mat(/*from start state*/ 0, k); + trans_grad[tag_num + k] += + x_grad_mat(/*to end state*/ seq_length - 1, k); + } + + auto x_exps_mat = EigenMatrix::From(emission_exps); + + // TODO(caoying): Fix this to avoid using this local variable if we can + // profile the training process. 
+ Tensor tmp; + tmp.mutable_data(beta->dims(), platform::CPUPlace()); + auto tmp_mat = EigenMatrix::From(tmp); + auto prob = beta_mat * x_exps_mat; + auto row_sum = prob.sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(seq_length, 1)) + .broadcast(Eigen::DSizes(1, tag_num)); + tmp_mat.device(*place) = prob / row_sum; + + for (size_t k = 1; k < seq_length; ++k) { + T sum = 0.; + for (size_t i = 0; i < tag_num; ++i) { + for (size_t j = 0; j < tag_num; ++j) { + sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) + alpha_mat(k - 1, i) * tmp_mat(k, j); + } + } + sum = 1. / sum; + for (size_t i = 0; i < tag_num; ++i) { + for (size_t j = 0; j < tag_num; ++j) { + trans_grad[(i + state_trans_base_idx) * tag_num + j] += + sum * w_exps[(i + state_trans_base_idx) * tag_num + j] * + alpha_mat(k - 1, i) * tmp_mat(k, j) * ll_grad; + } + } + trans_grad[(label_value[k - 1] + state_trans_base_idx) * tag_num + + label_value[k]] -= static_cast(ll_grad); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a72708d9baad98b670b46da66f56bf79bad951be --- /dev/null +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" +#include "paddle/string/printf.h" + +namespace paddle { +namespace operators { + +constexpr char kOptimizeBlock[] = "OptimizeBlock"; + +void RunServer(std::shared_ptr service) { + service->RunSyncUpdate(); + VLOG(4) << "RunServer thread end"; +} + +static void CreateTensorFromMessageType(framework::Variable *var, + sendrecv::VarType var_type) { + if (var_type == sendrecv::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == sendrecv::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else { + PADDLE_THROW( + "VariableMessage type %d is not in " + "[LoDTensor, SelectedRows]", + var_type); + } +} + +class ListenAndServOp : public framework::OperatorBase { + public: + ListenAndServOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + if (!rpc_service_) { + std::string endpoint = Attr("endpoint"); + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + server_thread_.reset(new std::thread(RunServer, rpc_service_)); + } + } + + void Stop() override { + detail::MessageWithName term_msg; + term_msg.first = LISTEN_TERMINATE_MESSAGE; + rpc_service_->Push(term_msg); + rpc_service_->ShutDown(); + server_thread_->join(); + } + + std::string GetGradVarNameForTrainer(const std::string &varname) const { + if (grads_counter_.find(varname) == grads_counter_.end()) { + grads_counter_[varname] = 0; + } + return 
string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++); + } + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Scope &recv_scope = scope.NewScope(); + + // FIXME(Yancey1989): initialize rpc server with lazy mode. + rpc_service_->SetScope(&recv_scope); + rpc_service_->SetDevCtx(&dev_ctx); + auto param_list = Attr>("ParamList"); + auto grad_list = Attr>("GradList"); + auto fan_in = Attr("Fanin"); + + auto *block = Attr(kOptimizeBlock); + auto *program = block->Program(); + framework::Executor executor(dev_place); + + // TODO(typhoonzero): change this to a while_op for every cluster-batch. + bool exit_flag = false; + while (!exit_flag) { + // Get from multiple trainers, we don't care about the order in which + // the gradients arrives, just add suffix 0~n and merge the gradient. + rpc_service_->SetCond(0); + size_t recv_var_cnt = 0; + int batch_barrier = 0; + while (batch_barrier != fan_in) { + const detail::MessageWithName &v = rpc_service_->Get(); + auto grad_var_name = v.first; + if (grad_var_name == LISTEN_TERMINATE_MESSAGE) { + LOG(INFO) << "received terminate message and exit"; + exit_flag = true; + break; + } else if (grad_var_name == BATCH_BARRIER_MESSAGE) { + VLOG(3) << "recv batch barrier message"; + batch_barrier++; + continue; + } else { + // receive a variable + recv_var_cnt++; + auto it = + std::find(grad_list.begin(), grad_list.end(), grad_var_name); + std::string param_var_name; + if (it != grad_list.end()) { + param_var_name = param_list[it - grad_list.begin()]; + } else { + LOG(ERROR) << "grad has no paired param:" << grad_var_name; + } + VLOG(3) << "received grad: " << grad_var_name + << " updating param: " << param_var_name; + + if (fan_in > 1) { + grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); + } + auto *var = 
recv_scope.FindVar(grad_var_name); + if (var == nullptr) { + LOG(ERROR) << "Can not find server side var: " << grad_var_name; + PADDLE_THROW("Can not find server side var"); + } + detail::DeserializeFromMessage(v.second, dev_ctx, var); + } + } + VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier."; + // TODO(Yancey1989): merge SelectedRows variables here + if (exit_flag) { + rpc_service_->ShutDown(); + } + + try { + executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ + false /*create_local_scope*/, false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + rpc_service_->SetCond(1); + rpc_service_->WaitClientGet(recv_var_cnt); + grads_counter_.clear(); + } // while(true) + } + + protected: + std::shared_ptr rpc_service_; + std::shared_ptr server_thread_; + mutable std::unordered_map grads_counter_; +}; + +class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC( +ListenAndServ operator + +This operator will start a RPC server which can receive variables +from send_op and send back variables to recv_op. 
+)DOC"); + AddAttr("endpoint", + "(string, default 127.0.0.1:6164)" + "IP address to listen on.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr(kOptimizeBlock, + "BlockID to run on server side."); + AddAttr>( + "ParamList", "type list of string", + "grad->param name mapping to find which parameters to optimize.") + .SetDefault({}); + AddAttr>( + "GradList", "type list of string", + "grad->param name mapping to find which parameters to optimize.") + .SetDefault({}); + AddAttr("Fanin", "type int", + "Number of trainers in the current cluster job") + .SetDefault(1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp, + ops::ListenAndServOpMaker); diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1948063d886b79964b1a52d9d82a8e7d2fb0d493 --- /dev/null +++ b/paddle/fluid/operators/load_combine_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class LoadCombineOp : public framework::OperatorBase { + public: + LoadCombineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), + "Cannot open file %s for load_combine op", filename); + + auto out_var_names = Outputs("Out"); + PADDLE_ENFORCE_GT( + static_cast(out_var_names.size()), 0, + "The number of output variables should be greater than 0."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < out_var_names.size(); i++) { + auto *out_var = scope.FindVar(out_var_names[i]); + + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", + out_var_names[i]); + + auto *tensor = out_var->GetMutable(); + + // Error checking + PADDLE_ENFORCE(static_cast(fin), "Cannot read more from file %s", + filename); + + // Get data from fin to tensor + DeserializeFromStream(fin, tensor, dev_ctx); + + if (platform::is_gpu_place(place)) { + // copy CPU to GPU + framework::LoDTensor cpu_tensor; + cpu_tensor.ShareDataWith(*tensor); + cpu_tensor.set_lod(tensor->lod()); + + // reset tensor + out_var->Clear(); + tensor = out_var->GetMutable(); + tensor->set_lod(cpu_tensor.lod()); + Copy(cpu_tensor, place, dev_ctx, tensor); + } + } + } +}; + +class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput( + "Out", + "(vector) The 
output LoDTensors that will be read from the input file.") + .AsDuplicable(); + AddAttr("file_path", + "(string) " + "LoDTensors will be loaded from \"file_path\".") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + AddComment(R"DOC( +LoadCombine Operator. + +LoadCombine operator loads LoDTensor variables from a file. The file should +contain one or more LoDTensors serialized using the SaveCombine operator. The +LoadCombine operator applies a deserialization strategy to appropriately load +the LodTensors, and this strategy complements the serialization strategy used +in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled +with the SaveCombine operator, and can only deserialize one or more LoDTensors +that were saved using the SaveCombine operator. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(load_combine, ops::LoadCombineOp, + ops::LoadCombineOpProtoMaker); diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9bf5d72b234f96d9eb5a4c275737ac8c18cd63d --- /dev/null +++ b/paddle/fluid/operators/load_op.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class LoadOp : public framework::OperatorBase { + public: + LoadOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", + filename); + + auto out_var_name = Output("Out"); + auto *out_var = scope.FindVar(out_var_name); + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", + out_var_name); + + auto *tensor = out_var->GetMutable(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + DeserializeFromStream(fin, tensor, dev_ctx); + + if (platform::is_gpu_place(place)) { + // copy CPU to GPU + framework::LoDTensor cpu_tensor; + cpu_tensor.ShareDataWith(*tensor); + cpu_tensor.set_lod(tensor->lod()); + + // reset tensor + out_var->Clear(); + tensor = out_var->GetMutable(); + tensor->set_lod(cpu_tensor.lod()); + Copy(cpu_tensor, place, dev_ctx, tensor); + } + } +}; + +class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddAttr("file_path", + "(string) " + "Variable will be loaded from \"file_path\".") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + AddComment(R"DOC( +Load Operator. + +Load operator will load a tensor variable from disk file. 
+ +)DOC"); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker); diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f11f5a89f5ad5b2f3deed905625aefa1e9d9935b --- /dev/null +++ b/paddle/fluid/operators/lod_array_length_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class LoDArrayLengthOp : public framework::OperatorBase { + public: + LoDArrayLengthOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + out.Resize({1}); + auto cpu = platform::CPUPlace(); + *out.mutable_data(cpu) = static_cast(x.size()); + } +}; + +class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensorArray) The input tensor array."); + AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t"); + AddComment(R"DOC( +LoDArrayLength Operator. + +This operator obtains the length of lod tensor array: + +$$Out = len(X)$$ + +NOTE: The output is a CPU Tensor since the control variable should be only in +CPU and the length of LoDTensorArray should be used as control variables. 
+ +)DOC"); + } +}; + +class LoDArrayLengthInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasOutput("Out")); + context->SetOutputDim("Out", {1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lod_array_length, ops::LoDArrayLengthOp, + ops::LoDArrayLengthInferShape, ops::LoDArrayLengthProtoMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b9426a9f8f0b0b3082667dc7a1414aceb824aca --- /dev/null +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace operators { + +class LoDRankTableOp : public framework::OperatorBase { + public: + LoDRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto x = scope.FindVar(Input("X"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + VLOG(10) << "Level = " << static_cast(Attr("level")); + out->Reset(x.lod(), static_cast(Attr("level"))); + VLOG(10) << Input("X") << "'s lod information is " << *out; + } +}; + +class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) input lod tensor, must contain lod information."); + AddOutput("Out", "(LoDRankTable) The rank table of specific level."); + AddAttr("level", "(int) the specific lod level to rank.") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment(R"DOC(Create LoDRanTable by LoDTensor + +LoD Rank Table stores the `level` of `lod` which is ordered by sequence +length in descending order. It is useful when implement dynamic RNN and is +shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice +output operators. 
+)DOC"); + } +}; + +class LoDRankTableInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "LoDRankTable must has input X"); + } +}; + +class LoDRankTableInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o : op_desc.Output("Out")) { + block->FindRecursiveOrCreateVar(o).SetType( + framework::proto::VarDesc::LOD_RANK_TABLE); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(lod_rank_table, paddle::operators::LoDRankTableOp, + paddle::operators::LoDRankTableOpProtoMaker, + paddle::operators::LoDRankTableInferShape, + paddle::operators::LoDRankTableInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..55ae71c1815470925b2bb153fc647b331dcc9ba4 --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/lod_reset_op.h" + +namespace paddle { +namespace operators { + +class LoDResetOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LoDResetOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LoDResetOp should not be null."); + // If target LoD is not set form Input(), then it must be set from Attr(). + if (!ctx->HasInput("TargetLoD")) { + auto level0 = ctx->Attrs().Get>("target_lod"); + PADDLE_ENFORCE(level0.size() > 1, + "Target LoD is not found, should be set to be a valid one " + "through Input() or Attr()."); + } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The input tensor of lod_reset operator."); + AddInput("TargetLoD", + "(Tensor, optional) The target level 0 LoD from Input().") + .AsDispensable(); + AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator."); + AddAttr>("target_lod", + "The target level 0 LoD from Attr().") + .SetDefault(std::vector{}); + AddComment(R"DOC(LoDReset operator + +Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or +Attr(target_lod), or set LoD for Input(X) if it doesn't have one. +Currently the lod_reset operator only supports the reset of level 0 LoD. 
+At least one of Input(TargetLoD) and Attr(target_lod) must be set, +and if both of them are set, Input(TargetLoD) will be chosen as the +target LoD. + +An example: +Given a float LoDTensor X with shape (6, 1), its transpose form represents + + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + +with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like + + [1.0, 2.0], [3.0, 4.0, 5.0], [6.0]. + +If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and +the sequences that the LoDTensor Output(Out) contains becomes: + + [1.0, 2.0, 3.0, 4.0], [5.0, 6.0]. + +)DOC"); + } +}; + +class LoDResetGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad, + ops::LoDResetGradOp); +REGISTER_OP_CPU_KERNEL(lod_reset, + ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CPU_KERNEL( + lod_reset_grad, ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8bfc8bd3bf06037d7fcd387dee0514a1e4c6a0f9 --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lod_reset_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + lod_reset, ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a10efee0bdd8c58d23c05bb85f0f882d801848fe --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LoDResetKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); + auto* lod_t = ctx.Input("TargetLoD"); + + std::vector level0; + if (lod_t) { + auto* lod = lod_t->data(); + if (platform::is_gpu_place(ctx.GetPlace())) { + framework::Tensor lod_cpu; + framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(), + &lod_cpu); + lod = lod_cpu.data(); + } + level0 = std::vector(lod, lod + lod_t->numel()); + } else { + level0 = ctx.Attr>("target_lod"); + } + + PADDLE_ENFORCE(level0.size() > 1UL, + "The size of target LoD should be greater than 1."); + PADDLE_ENFORCE(level0[0] == 0, + "Target LoD should be a vector starting from 0."); + PADDLE_ENFORCE(level0.back() == in->dims()[0], + "Target LoD should be a vector end with the " + "first dimension of Input(X)."); + for (size_t i = 0; i < level0.size() - 1; ++i) { + PADDLE_ENFORCE(level0[i + 1] > level0[i], + "Target LoD should be an ascending vector."); + } + + out->ShareDataWith(*in); + // cast level0 to size_t + std::vector ulevel0(level0.size(), 0); + std::transform(level0.begin(), level0.end(), ulevel0.begin(), + [](int a) { return static_cast(a); }); + framework::LoD target_lod; + target_lod.push_back(ulevel0); + out->set_lod(target_lod); + } +}; + +template +class LoDResetGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + + d_x->ShareDataWith(*d_out); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..edc32bcec1441e50e24612789727db9a044cde54 --- /dev/null +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct CopyRange { + size_t begin; + size_t end; +}; + +class LoDTensorToArrayOp : public framework::OperatorBase { + public: + LoDTensorToArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s", + Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable"))) + .Get(); + auto &out = *detail::Ref(scope.FindVar(Output("Out"))) + .GetMutable(); + auto &items = rank_table.items(); + auto max_seq_len = items[0].length; + auto rank_level = rank_table.level(); + + PADDLE_ENFORCE_LT(rank_level, x.lod().size(), + "Input should be a LOD tensor, and size is at least %d", + 
rank_level + 1); + out.resize(max_seq_len); + std::vector> copy_ranges(max_seq_len); + + // set out[i] lod + for (size_t t = 0; t < max_seq_len; t++) { + auto &lod = *out[t].mutable_lod(); + lod.clear(); + for (auto &item : items) { + if (t >= item.length) { + break; + } + size_t start_idx = x.lod()[rank_level][item.index] + t; + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x.lod(), start_idx, start_idx + 1, rank_level + 1); + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(&lod, lod_length); + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); + } + } + for (size_t i = 0; i < max_seq_len; ++i) { + auto &ranges = copy_ranges[i]; + size_t height = std::accumulate( + ranges.begin(), ranges.end(), 0UL, + [](size_t a, const CopyRange &b) { return a + b.end - b.begin; }); + auto x_dim = x.dims(); + x_dim[0] = static_cast(height); + out[i].Resize(x_dim); + out[i].mutable_data(x.place(), x.type()); + size_t offset = 0; + for (auto &each_range : ranges) { + size_t len = each_range.end - each_range.begin; + if (len == 0) { + continue; + } + // out[i][offset: offset+len] = x[each_range.begin: each_range.end] + auto slice = out[i].Slice(static_cast(offset), + static_cast(offset + len)); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); + offset += len; + } + } + } +}; + +class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddInput("RankTable", ""); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class LoDTensorToArrayInferShape : public 
framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of LoDTensorToArrayOp should not be null."); + PADDLE_ENFORCE( + context->HasInput("RankTable"), + "Input(RankTable) of LoDTensorToArrayOp should not be null."); + + PADDLE_ENFORCE(context->HasOutput("Out"), + "Output(Out) of LoDTensorToArrayOp should not be null."); + + auto x_dim = context->GetInputDim("X"); + // The first dim of each LoDTensor in Output can only be set at run-time.; + // We still have to Resize each LoDTensor in Output. + context->SetOutputDim("Out", x_dim); + } +}; + +class LoDTensorToArrayInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &out_var : op_desc.Output("Out")) { + block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); + } + } +}; + +class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("array_to_lod_tensor"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("RankTable", Input("RankTable")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lod_tensor_to_array, ops::LoDTensorToArrayOp, + ops::LoDTensorToArrayOpProtoMaker, + ops::LoDTensorToArrayInferShape, + ops::LoDTensorToArrayInferVarType, + ops::LoDTensorToArrayGradMaker); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c5cd2956811329a1ac5da9e42e808c2684dc771 --- 
/dev/null +++ b/paddle/fluid/operators/log_loss_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/log_loss_op.h" + +namespace paddle { +namespace operators { + +class LogLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Predicted) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Predicted) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Predicted", "Loss"); + } +}; + +template +class LogLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Predicted", + "The input value (Predicted) of Log loss op." 
+ "Predicted is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Log loss op." + "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the log loss."); + AddAttr("epsilon", "Epsilon in log loss."); + AddComment(R"DOC( +LogLoss Operator. + +Log loss is a loss function used for binary classification. Log Loss quantifies +the accuracy of a classifier by penalising false classifications. Minimising the +Log Loss is equivalent to maximising the accuracy of the classifier. We define +Predicted as the values predicted by our model and Labels as the target ground +truth value. Log loss can evaluate how close the predicted values are to the +target. The shapes of Predicted and Labels are both [batch_size, 1]. +The equation is: + +$$ +Loss = - Labels * log(Predicted + \epsilon) - + (1 - Labels) * log(1 - Predicted + \epsilon) +$$ + +)DOC"); + } +}; + +class LogLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")), + "Output(Predicted@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Predicted"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops 
= paddle::operators; +REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, + ops::LogLossGradOp); +REGISTER_OP_CPU_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..c164a6d04056c2e9d9302b609c1ff4f2b2c4a3f3 --- /dev/null +++ b/paddle/fluid/operators/log_loss_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/log_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..67fac7cfe55d1d50063afac925863a8fb2eb63a8 --- /dev/null +++ b/paddle/fluid/operators/log_loss_op.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class LogLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* loss_out = ctx.Output("Loss"); + + loss_out->mutable_data(ctx.GetPlace()); + + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto loss = EigenVector::Flatten(*loss_out); + auto& place = *ctx.template device_context().eigen_device(); + + loss.device(place) = (-(label * (prediction + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - prediction + epsilon).log())); + } +}; + +template +class LogLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + + auto dl = EigenVector::Flatten(*dloss); + auto& place = *ctx.template device_context().eigen_device(); + + if (dpred) { + dpred->mutable_data(ctx.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); 
+ dx.device(place) = dl * (-(label / (prediction + epsilon)) + + ((static_cast(1) - label) / + (static_cast(1) - prediction + epsilon))); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff49895df1979bde9dc9f9c7b92601a2a65241da --- /dev/null +++ b/paddle/fluid/operators/logical_op.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", + string::Sprintf("(LoDTensor) Left hand operand of %s operator", + comment.type)); + AddInput("Y", + string::Sprintf("(LoDTensor) Right hand operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean tensors. 
+Each element of Out is calculated by %s +)DOC", + comment.type, comment.equation)); + } +}; + +template +class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X, and returns the Out. X and Out are N-dim boolean tensors. +Each element of Out is calculated by %s +)DOC", + comment.type, comment.equation)); + } +}; + +template +class BinaryLogicalOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of %s operator must not be null", comment.type); + PADDLE_ENFORCE(context->HasInput("Y"), + "Input(Y) of %s operator must not be null", comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y), + "The number of elements in X and Y should be same"); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +template +class UnaryLogicalOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of %s operator must not be null", comment.type); + auto dim_x = context->GetInputDim("X"); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class LogicalOp : public framework::OperatorWithKernel { + public: + using 
framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // LogicalOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_BINARY_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::LogicalOp, \ + ::paddle::operators::BinaryLogicalOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::BinaryLogicalOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +#define REGISTER_UNARY_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::LogicalOp, \ + ::paddle::operators::UnaryLogicalOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, + paddle::operators::LogicalAndFunctor); +REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, + paddle::operators::LogicalOrFunctor); +REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, + paddle::operators::LogicalNotFunctor); +REGISTER_BINARY_LOGICAL_OP(logical_xor, + "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); 
+REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, + paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/logical_op.cu b/paddle/fluid/operators/logical_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b17444061252b714cd9bbaadc7fcf877628ef89 --- /dev/null +++ b/paddle/fluid/operators/logical_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/logical_op.h" + +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, + paddle::operators::LogicalAndFunctor); +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, + paddle::operators::LogicalOrFunctor); +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, + paddle::operators::LogicalNotFunctor); +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, + paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/logical_op.h b/paddle/fluid/operators/logical_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f6d5866c2c8e4ce54e9556a1acf69414dba523d2 --- /dev/null +++ b/paddle/fluid/operators/logical_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct LogicalAndFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; } +}; + +template +struct LogicalOrFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; } +}; + +template +struct LogicalNotFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a) const { return !a; } +}; + +template +struct LogicalXorFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + return (a || b) && !(a && b); + } +}; + +template +class BinaryLogicalOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); + Functor binary_func; + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); + } +}; + +template +class UnaryLogicalOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + Functor unary_func; + platform::Transform trans; + 
trans(context.template device_context(), x->data(), + x->data() + x->numel(), + out->mutable_data(context.GetPlace()), unary_func); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); + +#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c555f1a3fa228215812b7d3291e882b0d42bb64 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/lookup_table_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class LookupTableOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + + ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("W")->type()), + ctx.device_context()); + } +}; + +class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("W", + "An input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "Ids must be a column vector with rank = 2. " + "The 2nd dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update") + .SetDefault(false); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. 
" + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(-1); + AddComment(R"DOC( +Lookup Table Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +class LookupTableOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "lookup_table_grad"; } +}; + +class LookupTableOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("W")->type()), + ctx.device_context()); + } +}; + +class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarDesc::SELECTED_ROWS); + } else { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } 
+}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, + ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); +REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, + ops::LookupTableOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, + ops::LookupTableKernel); +REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, + ops::LookupTableGradKernel); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..801adba5a440ebda9506717603e8f9665ea9e6ba --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -0,0 +1,176 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/lookup_table_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void LookupTable(T* output, const T* table, const int64_t* ids, + const int64_t N, const int64_t K, const int64_t D, + const int64_t padding_idx) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * GridDimX; + + while (idy < K) { + int64_t id = ids[idy]; + PADDLE_ASSERT(id >= 0); + PADDLE_ASSERT(id < N); + T* out = output + idy * D; + const T* tab = table + id * D; + for (int i = idx; i < D; i += BlockDimX) { + if (PaddingFlag) { + if (id == padding_idx) + out[i] = static_cast(0); + else + out[i] = tab[i]; + } else { + out[i] = tab[i]; + } + } + idy += BlockDimY * GridDimX; + } +} + +template +__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, + const int64_t N, const int64_t K, + const int64_t D) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * GridDimX; + + while (idy < K) { + int id = ids[idy]; + PADDLE_ASSERT(id >= 0); + PADDLE_ASSERT(id < N); + const T* out = output + idy * D; + T* tab = table + id * D; + for (int i = idx; i < D; i += BlockDimX) { + paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + } + idy += BlockDimY * GridDimX; + } +} + +template +class LookupTableCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* table_t = context.Input("W"); + auto* ids_t = context.Input("Ids"); + auto* output_t = context.Output("Out"); + int64_t padding_idx = context.Attr("padding_idx"); + + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + auto* ids = ids_t->data(); + auto* table = table_t->data(); + auto* output = output_t->mutable_data(context.GetPlace()); + + dim3 
threads(128, 8); + dim3 grids(8, 1); + + if (padding_idx == -1) + LookupTable< + T, 128, 8, 8, + false><<>>( + output, table, ids, N, K, D, padding_idx); + else + LookupTable< + T, 128, 8, 8, + true><<>>( + output, table, ids, N, K, D, padding_idx); + } +}; + +template +class LookupTableGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto* ids = context.Input("Ids"); + auto* table = context.Input("W"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + auto stream = dev_ctx.stream(); + // copy GPU memory to CPU pinned memory + framework::Vector new_rows; + new_rows.resize(ids_dim[0]); + auto gpu_place = boost::get(context.GetPlace()); + + // TODO(yuyang18): Strange code here. 
+ memory::Copy(platform::CPUPlace(), + new_rows.CUDAMutableData(context.GetPlace()), gpu_place, + ids_data, ids_dim[0] * sizeof(int64_t), stream); + + d_table->set_rows(new_rows); + + auto* d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + auto* d_table_data = d_table_value->data(); + auto* d_output_data = d_output->data(); + PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, + d_output->numel() * sizeof(T), stream); + + } else { + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + auto d_table_t = context.Output(framework::GradVarName("W")); + + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; + int K = ids_t->numel(); + const int64_t* ids = ids_t->data(); + const T* d_output = d_output_t->data(); + T* d_table = d_table_t->mutable_data(context.GetPlace()); + + auto t = framework::EigenVector::Flatten(*d_table_t); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + dim3 threads(128, 8); + dim3 grids(8, 1); + LookupTableGrad<<>>( + d_table, d_output, ids, N, K, D); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_grad, + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d264496882a9e1828953d843d4a18fe4c16b1d24 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +template +class LookupTableKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* table_t = context.Input("W"); // float tensor + auto* ids_t = context.Input("Ids"); // int tensor + auto* output_t = context.Output("Out"); // float tensor + int64_t padding_idx = context.Attr("padding_idx"); + + int N = table_t->dims()[0]; + int D = table_t->dims()[1]; + auto* ids = ids_t->data(); + auto* table = table_t->data(); + auto* output = output_t->mutable_data(context.GetPlace()); + + if (padding_idx == -1) { + for (int64_t i = 0; i < ids_t->numel(); ++i) { + PADDLE_ENFORCE_LT(ids[i], N); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + } + } else { + for (int64_t i = 0; i < ids_t->numel(); ++i) { + if (ids[i] == padding_idx) { + memset(output + i * D, 0, D * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], N); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + } + } + } + } +}; + +template +class LookupTableGradKernel : public 
framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto* ids = context.Input("Ids"); + auto* table = context.Input("W"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + framework::Vector new_rows; + new_rows.reserve(ids_dim[0]); + for (int64_t i = 0; i < ids_dim[0]; i++) { + new_rows.push_back(ids_data[i]); + } + d_table->set_rows(new_rows); + + auto* d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + d_table->set_height(table->dims()[0]); + + auto* d_output_data = d_output->data(); + auto* d_table_data = d_table_value->data(); + + PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + } else { + auto* ids = context.Input("Ids"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + auto* table = context.Input("W"); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + int N = table->dims()[0]; + int D = d_output->dims()[1]; + + auto* d_output_data = d_output->data(); + auto* d_table_data = d_table->mutable_data(context.GetPlace()); + + memset(d_table_data, 0, d_table->numel() * sizeof(T)); + + for (int64_t i = 0; i < ids->numel(); ++i) { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git 
a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c84507f231c6d3c4c5d6e33719fafc2752876f03 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.cc @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lrn_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + auto x_v = framework::EigenVector::Flatten(input); + + const int start = -(n - 1) / 2; + const int end = start + n; + + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + + auto e_x = framework::EigenTensor::From(input); + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch >= 0 && ch < C) { + auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + s += alpha * r.square(); + } + } + } + } + + auto out_e = framework::EigenVector::Flatten(*out); + out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + } +}; +template struct LRNFunctor; +template 
struct LRNFunctor; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + T ratio = -2 * alpha * beta; + auto x_g_e = framework::EigenVector::Flatten(*x_g); + x_g_e = x_g_e.constant(0.0); + + auto e_x = framework::EigenTensor::From(x); + auto e_x_g = framework::EigenTensor::From(*x_g); + auto e_out = framework::EigenTensor::From(out); + auto e_out_g = framework::EigenTensor::From(out_g); + auto e_mid = framework::EigenTensor::From(mid); + + const int start = -(n - 1) / 2; + const int end = start + n; + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_out_g = e_out_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g = i_mid.pow(-beta) * i_out_g; + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch < 0 || ch >= C) { + continue; + } + + auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g += ratio * c_out_g * c_out * i_x / c_mid; + } + } + } + } +}; +template struct LRNGradFunctor; +template struct LRNGradFunctor; + +class LRNOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), 
"Input(X) of LRNOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LRNOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MidOut"), + "MidOut(Out) of LRNOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); + + ctx->SetOutputDim("Out", x_dim); + ctx->SetOutputDim("MidOut", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class LRNOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input of LRN operator. " + "It must be a 4D tenor with NCHW format."); + AddOutput("Out", + "(Tensor) The output of LRN operator, which is also the 4D " + "tensor with NCHW format."); + AddOutput("MidOut", + "(Tensor) Middle result of LRN operator. It's computed in " + "forward process and also used in backward process."); + + AddAttr("n", + "(int default 5) " + "n is the \"adjacent\" kernel that maps " + "at the same spatial position.") + .SetDefault(5) + .GreaterThan(0); + + AddAttr("k", + "(float, default 2.0) " + "k is the bias.") + .SetDefault(2.0) + .GreaterThan(0.0); + + AddAttr("alpha", + "(float, default 0.0001) " + "alpha is the scale number.") + .SetDefault(0.0001) + .GreaterThan(0.0); + + AddAttr("beta", + "(float, default 0.75) " + "beta is the power number.") + .SetDefault(0.75) + .GreaterThan(0.0); + + AddComment(R"DOC( +Local Response Normalization Operator. + +This operator comes from the paper: +<>. + +The original formula is: + +$$ +Output(i, x, y) = Input(i, x, y) / \left( +k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)} +(Input(j, x, y))^2 +\right)^{\beta} +$$ + +Function implementation: + +Inputs and outpus are in NCHW format, while input.shape.ndims() equals 4. +And dimensions 0 ~ 3 represent batch size, feature maps, rows, +and columns, respectively. 
+ +Input and Output in the formula above is for each map(i) of one image, and +Input(i, x, y), Output(i, x, y) represents an element in an image. + +C is the number of feature maps of one image. n is a hyper-parameter +configured when operator is initialized. The sum in the denominator +is the sum of the same positions in the neighboring maps. + +)DOC"); + } +}; + +class LRNOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); +REGISTER_OP_CPU_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..03112bf3e03595a521c22cd914f414f026970c10 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lrn_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C, + int H, int W, int size, T k, T alpha) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + + in += offset; + mid += offset; + const int step = H * W; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + + T accum = 0; + int index = 0; + while (index < C + post_pad) { + if (index < C) { + T val = in[index * step]; + accum += val * val; + } + if (index >= size) { + T val = in[(index - size) * step]; + accum -= val * val; + } + if (index >= post_pad) { + mid[(index - post_pad) * step] = k + accum * alpha; + } + ++index; + } + } +} + +template +__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid, + T negative_beta, T* out) { + const int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < input_size) { + out[index] = in[index] * pow(mid[index], negative_beta); + } +} + +template +void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, + T* outputs, T* mid, int N, int C, int H, int W, int n, T k, + T alpha, T beta) { + int img_size = N * H * W; + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormFillScale<<>>( + img_size, inputs, mid, C, H, W, n, k, alpha); + + int input_size = N * H * W * C; + grid_size = (input_size + block_size - 1) / block_size; + KeCMRNormOutput<<>>( + input_size, inputs, mid, -beta, outputs); +} + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, 
framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + CrossMapNormal( + ctx, input.data(), out->mutable_data(ctx.GetPlace()), + mid->mutable_data(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta); + } +}; + +template struct LRNFunctor; +template struct LRNFunctor; + +template +__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, + const T* mid, T* x_g, const T* out_g, int C, + int H, int W, int size, T negative_beta, + T ratio) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + x += offset; + out += offset; + mid += offset; + out_g += offset; + x_g += offset; + + const int step = H * W; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + + int index = 0; + T accum = 0; + // TODO(gongwb): optimize this with thread shared array. 
+ while (index < C + post_pad) { + if (index < C) { + x_g[index * step] = 0.0; + accum += out_g[index * step] * out[index * step] / mid[index * step]; + } + if (index >= size) { + accum -= out_g[(index - size) * step] * out[(index - size) * step] / + mid[(index - size) * step]; + } + if (index >= post_pad) { + x_g[(index - post_pad) * step] += + out_g[(index - post_pad) * step] * + pow(mid[(index - post_pad) * step], negative_beta) - + ratio * x[(index - post_pad) * step] * accum; + } + ++index; + } + } +} + +template +void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, + const T* out, const T* mid, T* x_g, const T* out_g, + int N, int C, int H, int W, int n, T alpha, T beta) { + int img_size = N * H * W; + + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormDiff<<>>( + img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, + 2.0f * alpha * beta); +} + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + CrossMapNormalGrad(ctx, x.data(), out.data(), mid.data(), + x_g->mutable_data(ctx.GetPlace()), out_g.data(), + N, C, H, W, n, alpha, beta); + } +}; + +template struct LRNGradFunctor; +template struct LRNGradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b7b78b459145bae5483e3f12b3d872c679823740 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle 
Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta); +}; + +template +class LRNKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + + // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta) + // x represents inputs + // f(x) represents outputs + void Compute(const framework::ExecutionContext& ctx) const override { + // input + const Tensor& x = *ctx.Input("X"); + auto x_dims = x.dims(); + + // NCHW + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + // MidOut save the intermediate result for backward + Tensor* mid = ctx.Output("MidOut"); + mid->mutable_data(ctx.GetPlace()); + + int n = ctx.Attr("n"); + T alpha = ctx.Attr("alpha"); + T beta = ctx.Attr("beta"); + T k = ctx.Attr("k"); + + PADDLE_ENFORCE(n > 0, "n should >= 0"); + PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0"); + PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); + PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); + + LRNFunctor 
f; + f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); + } +}; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta); +}; + +/** + * \brief Backward calculation for normalization with across maps. + * + * Function implementation: + * + * The implementation of this Function is derived from the + * CrossMapNormalFunc implementation. + * + * InputGrad = OutputGrad * MidOut ^ (-beta) + * -- upper + * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue + * -- lower + * + * The data of inputs/outputs format is the same as the forward interface + * and is NCHW. + * + * The upper and lower is the same as forward. The logic of the sum + * is also the same as forward. + */ +template +class LRNGradKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor& x = *ctx.Input("X"); + const Tensor& out = *ctx.Input("Out"); + const Tensor& out_g = *ctx.Input(framework::GradVarName("Out")); + const Tensor& mid = *ctx.Input("MidOut"); + + auto x_g = ctx.Output(framework::GradVarName("X")); + x_g->mutable_data(ctx.GetPlace()); + + auto x_dims = x.dims(); + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + int n = ctx.Attr("n"); + T alpha = ctx.Attr("alpha"); + T beta = ctx.Attr("beta"); + + LRNGradFunctor f; + f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1f1b5f235f991e7a4d84c815bc5dda74ab64752 --- /dev/null +++ b/paddle/fluid/operators/lstm_op.cc @@ 
-0,0 +1,281 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstm_op.h" + +namespace paddle { +namespace operators { + +class LSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(BatchGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchGate) of LSTM should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be 
the same."); + } + + int frame_size = in_dims[1] / 4; + auto w_dims = ctx->GetInputDim("Weight"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "The rank of Input(Weight) should be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], frame_size, + "The first dimension of Input(Weight) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, + "The second dimension of Input(Weight) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + } + + framework::DDim out_dims({in_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchGate", in_dims); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->ShareLoD("Input", "Hidden"); + ctx->ShareLoD("Input", "Cell"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) the first input is a LodTensor, which support " + "variable-time length input sequence. 
The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 4D), where T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) the initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) the initial cell state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddInput("Weight", + "(Tensor) the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights, which contains two parts: " + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddOutput("Hidden", + "(LoDTensor) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) the cell state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("BatchGate", + "(LoDTensor) This LoDTensor contains input gate, forget gate " + "and output gate after the nonlinear computation. This " + "LoDTensor has the same shape as the reorganized input, which " + "is also be called batch input. The LoD size is 2. 
The first " + "LoD is the batch offsets and the second LoD contains the " + "indexes, which denote the position of reorganized sequence " + "in the raw input.") + .AsIntermediate(); + AddOutput("BatchCellPreAct", + "(LoDTensor) This LoDTensor is obtained in the forward and used " + "in the backward.") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr( + "gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Long-Short Term Memory (LSTM) Operator. + +The defalut implementation is diagonal/peephole connection +(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: + +$$ +i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\ + +f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\ + +\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\ + +o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\ + +c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ + +h_t = o_t \odot act_h(c_t) +$$ + +where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix +of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ +are diagonal weight matrices for peephole connections. 
In our implementation, +we use vectors to reprenset these diagonal weight matrices. The b terms +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ +is the non-line activations, such as logistic sigmoid function, and +$i, f, o$ and $c$ are the input gate, forget gate, output gate, +and cell activation vectors, respectively, all of which have the same size as +the cell output activation vector $h$. + +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ +are the cell input and cell output activation functions and `tanh` is usually +used for them. $\tilde{c_t}$ is also called candidate hidden state, +which is computed based on the current input and the previous hidden state. + +Set `use_peepholes` False to disable peephole connection. The formula +is omitted here, please refer to the paper +http://www.bioinf.jku.at/publications/older/2604.pdf for details. + +Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. +Users can choose to use fully-connect operator before LSTM operator. 
+ +)DOC"); + } +}; + +class LSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cell"), + "Input(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(BatchGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"), + "Input(BatchGate) of LSTM should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + }; + + SetOutGradDim("Input"); + SetOutGradDim("Weight"); + SetOutGradDim("Bias"); + SetOutGradDim("H0"); + SetOutGradDim("C0"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); +REGISTER_OP_CPU_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..679d02b1f9a1d1f0313b5c8109285c92734e3e5a --- /dev/null +++ 
b/paddle/fluid/operators/lstm_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c48495533cf5dbf1f46176cff936f7f988a3d48 --- /dev/null +++ b/paddle/fluid/operators/lstm_op.h @@ -0,0 +1,376 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index_lod, *dst, indexed_src); +} + +template +class LSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* batch_gate = ctx.Output("BatchGate"); + batch_gate->mutable_data(ctx.GetPlace()); + auto* hidden_out = ctx.Output("Hidden"); + hidden_out->mutable_data(ctx.GetPlace()); + auto* cell_out = ctx.Output("Cell"); + cell_out->mutable_data(ctx.GetPlace()); + + bool is_reverse = ctx.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); + to_batch(device_ctx, *input, *batch_gate, true, is_reverse); + + auto in_dims = input->dims(); + int frame_size = static_cast(in_dims[1] / 4); + framework::DDim dims({in_dims[0], frame_size}); + + if (bias) { + Tensor b = *bias; + b.Resize({bias->numel(), 1}); + Tensor gate_bias = b.Slice(0, 4 * frame_size); + math::RowwiseAdd add_bias; + add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); + } + + math::LstmMetaValue lstm_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = 
const_cast(bias->data()); + // the code style in LstmMetaValue will be updated later. + + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + lstm_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batch_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); + lstm_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. + LoDTensor batch_hidden, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_hidden.mutable_data(dims, ctx.GetPlace()); + batch_cell.mutable_data(dims, ctx.GetPlace()); + batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor out_t = batch_hidden.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_hidden.Slice(pre_h_start, 
pre_h_end); + math::matmul(device_ctx, pre_hidden_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } else if (hidden_t0) { + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. + Tensor ordered_h0; + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } + + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); + lstm_value.prev_state_value = lstm_value.state_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden.set_lod(batch_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(device_ctx, batch_hidden, *hidden_out); + + batch_cell.set_lod(batch_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(device_ctx, batch_cell, *cell_out); + } +}; + +template +class LSTMGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_out = ctx.Input("Hidden"); + auto* cell_out = ctx.Input("Cell"); + + auto* batch_gate = ctx.Input("BatchGate"); + auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); + + auto* hidden_g = ctx.Input(framework::GradVarName("Hidden")); + + auto* in_g = 
ctx.Output(framework::GradVarName("Input")); + auto* weight_g = ctx.Output(framework::GradVarName("Weight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); + + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + if (weight_g) { + weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. + // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. + Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + framework::Vector order(batch_gate->lod()[2]); + + if (c0) { + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); + } + if (c0 && c0_g) { + ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); + } + + auto in_dims = input->dims(); + auto out_dims = hidden_g->dims(); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, out_dims[1]); + + math::LstmMetaValue lstm_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + + math::LstmMetaGrad lstm_grad; + + if (bias && bias_g) { + bias_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && ctx.Attr("use_peepholes")) { + T* bias_g_data = bias_g->data(); + lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; + lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; + } else { + 
lstm_grad.check_ig_grad = nullptr; + lstm_grad.check_fg_grad = nullptr; + lstm_grad.check_og_grad = nullptr; + } + + math::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch]( + const DeviceContext& ctx, const framework::LoDTensor& src, + const framework::DDim& dims, framework::LoDTensor& dst) { + dst.mutable_data(dims, ctx.GetPlace()); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, dst, false); + }; + + LoDTensor batch_hidden, batch_hidden_g, batch_cell; + ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden); + ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g); + ToBatch(device_ctx, *cell_out, out_dims, batch_cell); + + LoDTensor batch_cell_g, batch_gate_g; + batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); + // TODO(qingqing) support the case output cell has gradient. + // to_batch(device_ctx, *cell_g, batch_cell_g, false); + zero(device_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate = batch_gate->Slice(bstart, bend); + Tensor cell = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstm_value.gate_value = gate.data(); + lstm_value.state_value = cell.data(); + lstm_value.state_active_value = cell_pre_act.data(); + + Tensor out_g = batch_hidden_g.Slice(bstart, bend); + Tensor gate_g = batch_gate_g.Slice(bstart, bend); + Tensor cell_g = batch_cell_g.Slice(bstart, 
bend); + lstm_grad.state_grad = cell_g.data(); + lstm_grad.gate_grad = gate_g.data(); + lstm_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstm_value.prev_state_value = cell_pre.data(); + lstm_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data() : nullptr; + } + + int cur_batch_size = bend - bstart; + math::LstmUnitGradFunctor::compute( + device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, + gate_act, cell_act, cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_hidden_g, + static_cast(1.0)); + if (weight_g) { + /* backward weight */ + auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_hidden, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + } else { + if (h0 && weight_g) { + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + if (h0 && h0_g) { + ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), + &ordered_h0_g, static_cast(0.0)); + } + } + } + + math::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + in_g->mutable_data(ctx.GetPlace()); + to_seq(device_ctx, batch_gate_g, *in_g); + } + if (bias && bias_g) { + /* backward bias */ + Tensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); 
+ math::ColwiseSum col_sum; + col_sum(device_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); + } + if (c0 && c0_g) { + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d33d47e0c3ba8b83be2c06e9884d90e9bb1012e --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/lstm_unit_op.h" + +namespace paddle { +namespace operators { + +class LstmUnitOp : public framework::OperatorWithKernel { // forward op: validates X / C_prev and shapes C, H like C_prev + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("C_prev"), + "Input(C_prev) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("C"), + "Output(C) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("H"), + "Output(H) of LSTM should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto c_prev_dims = ctx->GetInputDim("C_prev"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0], + "Batch size of inputs and states must be equal"); + PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4, + "Dimension of FC should equal to prev state * 4"); + + int b_size = c_prev_dims[0]; // batch size + int s_dim = c_prev_dims[1]; // state dim + ctx->SetOutputDim("C", {b_size, s_dim}); + ctx->SetOutputDim("H", {b_size, s_dim}); + } +}; + +class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "Lstm unit only applies non-linear activations, please make sure " + "that linear transformation has already been applied to `X`. 
" + "Linear transformation can be applied by adding a `fc` layer"); + AddInput( + "C_prev", + "The cell state tensor of last time-step in the Lstm Unit operator."); + AddOutput("C", "The cell tensor of Lstm Unit operator."); + AddOutput("H", "The hidden state tensor of Lstm Unit operator."); + AddAttr("forget_bias", + "(float, default 0.0) " + "The forget bias of Lstm Unit.") + .SetDefault(0.0); // the kernels add this to the forget-gate input before the sigmoid + AddComment(R"DOC( +Lstm Unit Operator + +Equation: + +$$ +i, f, o, j = split(X) \\ +C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\ +H = tanh(C) * sigm(o) +$$ + +)DOC"); + } +}; + +class LstmUnitGradOp : public framework::OperatorWithKernel { // backward op: X@GRAD / C_prev@GRAD take the shapes of X / C_prev + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")), + "Input(C@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")), + "Input(H@GRAD) should not be null"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("C_prev"), + ctx->GetInputDim("C_prev")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad, + ops::LstmUnitGradOp); +REGISTER_OP_CPU_KERNEL(lstm_unit, + ops::LstmUnitKernel, + ops::LstmUnitKernel); +REGISTER_OP_CPU_KERNEL( + lstm_unit_grad, ops::LstmUnitGradKernel, + ops::LstmUnitGradKernel); diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..12ebffca37f995111bbeb2a1e8b30ea2fe35c74d --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* Acknowledgement: the following code is strongly inspired by +https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu +*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cross_entropy_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__device__ Dtype cuda_sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template +__device__ Dtype cuda_tanh(const Dtype x) { + return Dtype(2) * cuda_sigmoid(Dtype(2) * x) - Dtype(1); // tanh(x) == 2*sigmoid(2x) - 1; unlike (1 - e^-2x) / (1 + e^-2x) this stays finite (-1) when exp(-2x) overflows
+} + +template +__global__ void LSTMUnitKernel(const int nthreads, const int dim, + const T* C_prev, const T* X, T* C, T* H, + const T forget_bias) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + + const T* X_offset = X + 4 * dim * n; + const T i = cuda_sigmoid(X_offset[d]); + const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); + const T o = cuda_sigmoid(X_offset[2 * dim + d]); + const T g = cuda_tanh(X_offset[3 * dim + d]); + const T c_prev = C_prev[index]; + const T c = f * c_prev + i * g; + C[index] = c; + const T tanh_c = cuda_tanh(c); + H[index] = o * tanh_c; + } +} + +template +__global__ void LSTMUnitGradientKernel(const int nthreads, const int dim, + const T* C_prev, const T* X, const T* C, + const T* H, const T* C_diff, + const T* H_diff, T* C_prev_diff, + T* X_diff, const T forget_bias) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const T* X_offset = X + 4 * dim * n; + T* c_prev_diff = C_prev_diff + index; + T* X_diff_offset = X_diff + 4 * dim * n; + T* i_diff = X_diff_offset + d; + T* f_diff = X_diff_offset + 1 * dim + d; + T* o_diff = X_diff_offset + 2 * dim + d; + T* g_diff = X_diff_offset + 3 * dim + d; + + const T i = cuda_sigmoid(X_offset[d]); + const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); + const T o = cuda_sigmoid(X_offset[2 * dim + d]); + const T g = cuda_tanh(X_offset[3 * dim + d]); + const T c_prev = C_prev[index]; + const T c = C[index]; + const T tanh_c = cuda_tanh(c); + const T c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[index] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } +} + +template +class LstmUnitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + auto* x_tensor = ctx.Input("X"); + auto* c_prev_tensor = ctx.Input("C_prev"); + auto* c_tensor = ctx.Output("C"); + auto* h_tensor = ctx.Output("H"); + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int b_size = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + const T* X = x_tensor->data(); + const T* C_prev = c_prev_tensor->data(); + + T* C = c_tensor->mutable_data(ctx.GetPlace()); + T* H = h_tensor->mutable_data(ctx.GetPlace()); + + int block = 512; + int n = b_size * D; + int grid = (n + block - 1) / block; + + LSTMUnitKernel<<>>(n, D, C_prev, X, C, H, forget_bias); + } +}; + +template +class LstmUnitGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + auto x_tensor = ctx.Input("X"); + auto c_prev_tensor = ctx.Input("C_prev"); + auto c_tensor = ctx.Input("C"); + auto h_tensor = ctx.Input("H"); + + auto hdiff_tensor = ctx.Input(framework::GradVarName("H")); + auto cdiff_tensor = ctx.Input(framework::GradVarName("C")); + + auto xdiff_tensor = ctx.Output(framework::GradVarName("X")); + auto c_prev_diff_tensor = + ctx.Output(framework::GradVarName("C_prev")); + + auto* X = x_tensor->data(); + auto* C_prev = c_prev_tensor->data(); + auto* C = c_tensor->data(); + auto* H = h_tensor->data(); + + auto* H_diff = hdiff_tensor->data(); + auto* C_diff = cdiff_tensor->data(); + + auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); + auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); + + int N = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int block = 512; + int n = N * D; + int grid = (n + block - 1) / block; + + LSTMUnitGradientKernel<<>>(n, D, C_prev, 
X, C, H, C_diff, + H_diff, C_prev_diff, X_diff, + forget_bias); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9f2370fe690a45f49c0138fbd1303d7bfd6dacd0 --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* Acknowledgement: the following code is strongly inspired by +https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h +*/ + +#pragma once +#include "glog/logging.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +inline T sigmoid(T x) { + return 1. / (1. + exp(-x)); +} + +template +inline T tanh(T x) { + return 2. * sigmoid(2. 
* x) - 1.; +} + +template +class LstmUnitKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto* x_tensor = ctx.Input("X"); + auto* c_prev_tensor = ctx.Input("C_prev"); + auto* c_tensor = ctx.Output("C"); + auto* h_tensor = ctx.Output("H"); + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int b_size = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + T* C = c_tensor->mutable_data(ctx.GetPlace()); + T* H = h_tensor->mutable_data(ctx.GetPlace()); + + const T* X = x_tensor->data(); + const T* C_prev = c_prev_tensor->data(); + + for (int n = 0; n < b_size; ++n) { + for (int d = 0; d < D; ++d) { + const T i = sigmoid(X[d]); + const T f = sigmoid(X[1 * D + d] + forget_bias); + const T o = sigmoid(X[2 * D + d]); + const T g = tanh(X[3 * D + d]); + const T c_prev = C_prev[d]; + const T c = f * c_prev + i * g; + C[d] = c; + const T tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += D; + X += 4 * D; + C += D; + H += D; + } + } +}; + +template +class LstmUnitGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto x_tensor = ctx.Input("X"); + auto c_prev_tensor = ctx.Input("C_prev"); + auto c_tensor = ctx.Input("C"); + auto h_tensor = ctx.Input("H"); + + auto hdiff_tensor = ctx.Input(framework::GradVarName("H")); + auto cdiff_tensor = ctx.Input(framework::GradVarName("C")); + + auto xdiff_tensor = ctx.Output(framework::GradVarName("X")); + auto c_prev_diff_tensor = + ctx.Output(framework::GradVarName("C_prev")); + + auto* X = x_tensor->data(); + auto* C_prev = c_prev_tensor->data(); + auto* C = c_tensor->data(); + auto* H = h_tensor->data(); + + auto* H_diff = hdiff_tensor->data(); + auto* C_diff = cdiff_tensor->data(); + + auto* 
C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); + auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); + + int N = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + for (int n = 0; n < N; ++n) { + for (int d = 0; d < D; ++d) { + T* c_prev_diff = C_prev_diff + d; + T* i_diff = X_diff + d; + T* f_diff = X_diff + 1 * D + d; + T* o_diff = X_diff + 2 * D + d; + T* g_diff = X_diff + 3 * D + d; + + const T i = sigmoid(X[d]); + const T f = sigmoid(X[1 * D + d] + forget_bias); + const T o = sigmoid(X[2 * D + d]); + const T g = tanh(X[3 * D + d]); + const T c_prev = C_prev[d]; + const T c = C[d]; + const T tanh_c = tanh(c); + const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += D; + X += 4 * D; + C += D; + H += D; + C_diff += D; + H_diff += D; + X_diff += 4 * D; + C_prev_diff += D; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d30edf5c3cbabe7223a459a5f60b7b9aa51af9a --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.cc @@ -0,0 +1,331 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstmp_op.h" + +namespace paddle { +namespace operators { + +class LSTMPOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ProjWeight"), + "Input(ProjWeight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTMP operator should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Projection"), + "Output(Projection) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(BatchGate) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchCellPreAct) of LSTMP operator should not be " + "null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"), + "Output(BatchHidden) of LSTMP operator should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 2, + "Input(X)'s rank of LSTMP operator must be 2."); + + int frame_size = in_dims[1] / 4; + auto w_dims = ctx->GetInputDim("Weight"); + auto proj_dims = ctx->GetInputDim("ProjWeight"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "The rank of Input(Weight) should be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1], + "The first dimension of Input(Weight) " + "should be %d.", + proj_dims[1]); + PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, + "The second dimension of Input(Weight) " + "should be 4 * %d.", + frame_size); + + PADDLE_ENFORCE_EQ(proj_dims.size(), 2, + 
"The rank of Input(ProjWeight) should be 2."); + PADDLE_ENFORCE_EQ(proj_dims[0], frame_size, + "The first dimension of Input(ProjWeight) " + "should be %d.", + frame_size); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(C0) of LSTMP operator should not be null after " + "Input(H0) provided."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]}); + } + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + } + + framework::DDim out_dims({in_dims[0], frame_size}); + framework::DDim proj_out_dims({in_dims[0], proj_dims[1]}); + ctx->SetOutputDim("Projection", proj_out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchGate", in_dims); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->SetOutputDim("BatchHidden", out_dims); + ctx->ShareLoD("Input", "Projection"); + ctx->ShareLoD("Input", "Cell"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, 
op_checker) { + AddInput("Input", + "(LoDTensor) the input for sequence data, which supports " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 4D), where T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) the initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) the initial cell state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `C0` should not be null if `H0` provided.") + .AsDispensable(); + AddInput("Weight", + "(Tensor) the learnable hidden-hidden weights." + " - The shape is (P x 4D), where P is the projection layer size " + "and D is the hidden size." + " - Weight = {W_cr, W_ir, W_fr, W_or}"); + AddInput("ProjWeight", + "(Tensor) the learnable weight of the projection layer." + " - The shape is (D x P), where P is the recurrent projection " + "layer size and D is the hidden size." + " - ProjWeight = {W_rh}"); + AddInput("Bias", + "(Tensor) the learnable biases, which contains two parts: " + "input-hidden biases and peephole connections weights if " + "setting `use_peepholes` to `True`. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddOutput("Projection", + "(LoDTensor) the projection of the hidden state of LSTMP " + "operator. The shape is (T x P), and LoD is the same with the " + "`Input`."); + AddOutput("Cell", + "(LoDTensor) the cell state of LSTMP operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("BatchGate", + "(LoDTensor) This LoDTensor contains input gate, forget gate " + "and output gate after the activations. 
This LoDTensor has the " + "same shape as the reorganized input, which is also called " + "batch input. The LoD size is 2. The first-level LoD is the " + "batch offsets and the second contains the indices, which " + "denotes the position of reorganized sequence in the raw input.") + .AsIntermediate(); + AddOutput("BatchCellPreAct", + "(LoDTensor) the pre-activation cell state reorganized in batch. " + "This LoDTensor is obtained in the forward and used in the " + "backward.") + .AsIntermediate(); + AddOutput("BatchHidden", + "(LoDTensor) the hidden state reorganized in batch. " + "This LoDTensor is obtained in the forward and used in the " + "backward.") + .AsIntermediate(); + AddOutput("OrderedP0", + "(Tensor) the projection of the initial hidden state " + "H0. This is a tensor with shape (N x P), where N is the " + "batch size and P is the projection layer size.") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, default: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, default: False) " + "whether to compute reversed LSTMP.") + .SetDefault(false); + AddAttr( + "gate_activation", + "(string, default: sigmoid) " + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh) " + "The activation for cell output, `tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh) " + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("proj_activation", + "(string, default: tanh) " + "The activation for projection output, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Long-Short Term 
Memory with recurrent Projection layer (LSTMP) Operator. + +LSTMP has a separate projection layer after the LSTM layer, projecting the +original hidden state to a lower-dimensional one, which is proposed to reduce +the number of total parameters and furthermore computational complexity for +the LSTM, especially for the case that the size of output units is relatively +large (https://research.google.com/pubs/archive/43905.pdf). + +The formula is as follows: + +$$ +i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\ + +f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\ + +\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\ + +o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\ + +c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ + +h_t = o_t \odot act_h(c_t) \\ + +r_t = \overline{act_h}(W_{rh}h_t) +$$ + +where the W terms denote weight matrices (e.g. $W_{ix}$ is the matrix +of weights from the input to the input gate), $W_{ic}, W_{fc}, W_{oc}$ +are diagonal weight matrices for peephole connections. In our implementation, +we use vectors to represent these diagonal weight matrices. The b terms +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ +is the activation, such as logistic sigmoid function, and +$i, f, o$ and $c$ are the input gate, forget gate, output gate, +and cell activation vectors, respectively, all of which have the same size as +the cell output activation vector $h$. Here $h$ is usually called the hidden +state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also +called the candidate hidden state, whose computation is based on the current +input and the previous recurrent projection. + +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ +are the cell input and cell output activation functions and `tanh` is usually +used for them. $\overline{act_h}$ is the activation function for the +projection output, usually using `identity` or same as $act_h$. 
+ +Note that these $W_{ix}x_{t}, W_{fx}x_{t}, W_{cx}x_{t}, W_{ox}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. +Users can choose to use fully-connected operator before LSTMP operator. + +)DOC"); + } +}; + +class LSTMPGradOp : public framework::OperatorWithKernel { // backward op: checks the forward tensors it reads and shapes each present X@GRAD like X + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Projection"), + "Input(Projection) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cell"), + "Input(Cell) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ProjWeight"), + "Input(ProjWeight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTMP operator should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(BatchGate) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"), + "Input(BatchCellPreAct) of LSTMP operator should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { // gradient outputs are optional: only set dims for those present + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + }; + + SetOutGradDim("Input"); + SetOutGradDim("Weight"); + SetOutGradDim("ProjWeight"); + SetOutGradDim("Bias"); + SetOutGradDim("H0"); + SetOutGradDim("C0"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, 
lstmp_grad, + ops::LSTMPGradOp); +REGISTER_OP_CPU_KERNEL( + lstmp, ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CPU_KERNEL( + lstmp_grad, ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bcefb94c75b8577fefb1ee3b440dc5fb045562d5 --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstmp_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lstmp, ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CUDA_KERNEL( + lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..22ef4721860a493fded98cf32b40a2aceb851a5c --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.h @@ -0,0 +1,496 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index, *dst, indexed_src); +} + +template +class LSTMPKernel : public framework::OpKernel { + public: + template + void ActCompute(const math::detail::ActivationType act_type, const Device& d, + X x, Y y) const { + if (act_type == math::detail::ActivationType::kIdentity) + y.device(d) = x; + else if (act_type == math::detail::ActivationType::kSigmoid) + SigmoidFunctor()(d, x, y); + else if (act_type == math::detail::ActivationType::kTanh) + TanhFunctor()(d, x, y); + else if (act_type == math::detail::ActivationType::kReLU) + ReluFunctor()(d, x, y); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + 
auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_t0 = ctx.Input("H0"); + auto* ordered_proj0 = ctx.Output("OrderedP0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* batch_gate = ctx.Output("BatchGate"); + batch_gate->mutable_data(ctx.GetPlace()); + auto* proj_out = ctx.Output("Projection"); + proj_out->mutable_data(ctx.GetPlace()); + auto* cell_out = ctx.Output("Cell"); + cell_out->mutable_data(ctx.GetPlace()); + + bool is_reverse = ctx.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); + to_batch(device_ctx, *input, *batch_gate, true, is_reverse); + + auto in_dims = input->dims(); + int frame_size = static_cast(in_dims[1] / 4); + framework::DDim dims({in_dims[0], frame_size}); + framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); + + if (bias) { + Tensor b = *bias; + b.Resize({bias->numel(), 1}); + Tensor gate_bias = b.Slice(0, 4 * frame_size); + math::RowwiseAdd add_bias; + add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); + } + + math::LstmMetaValue lstmp_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + // the code style in LstmpMetaValue will be updated later. + + lstmp_value.check_ig = bias_data + 4 * frame_size; + lstmp_value.check_fg = lstmp_value.check_ig + frame_size; + lstmp_value.check_og = lstmp_value.check_fg + frame_size; + } else { + lstmp_value.check_ig = nullptr; + lstmp_value.check_fg = nullptr; + lstmp_value.check_og = nullptr; + } + lstmp_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batch_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTMP reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. 
+ ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); + lstmp_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. + LoDTensor batch_proj, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); + auto* batch_hidden = ctx.Output("BatchHidden"); + batch_hidden->mutable_data(dims, ctx.GetPlace()); // T x D + batch_proj.mutable_data(proj_dims, ctx.GetPlace()); // T x P + batch_cell.mutable_data(dims, ctx.GetPlace()); // T x D + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + auto proj_act = math::detail::GetActivationType( + ctx.Attr("proj_activation")); + auto& place = *ctx.template device_context().eigen_device(); + + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + Tensor proj_t = batch_proj.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_proj_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } else if (hidden_t0) { + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skipped. 
+ // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTMP reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. + + Tensor ordered_h0; + ordered_proj0->mutable_data(ctx.GetPlace()); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, + *proj_weight, false, static_cast(1.0), + ordered_proj0, static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj0_dev = EigenMatrix::From(*ordered_proj0); + ActCompute(cell_act, place, proj0_dev, proj0_dev); + } + math::matmul(device_ctx, *ordered_proj0, false, + *weight, false, static_cast(1.0), + &gate_t, static_cast(1.0)); + } + + lstmp_value.gate_value = gate_t.data(); + lstmp_value.output_value = hidden_t.data(); + lstmp_value.state_value = cell_t.data(); + lstmp_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); + lstmp_value.prev_state_value = lstmp_value.state_value; + math::matmul(device_ctx, hidden_t, false, *proj_weight, + false, static_cast(1.0), &proj_t, + static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj_t_dev = EigenMatrix::From(proj_t); + ActCompute(cell_act, place, proj_t_dev, proj_t_dev); + } + } + + math::Batch2LoDTensorFunctor to_seq; + batch_proj.set_lod(batch_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(device_ctx, batch_proj, *proj_out); + + batch_cell.set_lod(batch_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(device_ctx, batch_cell, *cell_out); + } +}; + +template +class LSTMPGradKernel : public framework::OpKernel { + public: + template + void ActGradCompute(const math::detail::ActivationType act_type, + const Device& d, X x, Y y, DX dx, DY 
dy) const { + // x is dummy and won't be used even in Relu(use y instead) + if (act_type == math::detail::ActivationType::kIdentity) + dx.device(d) = dy; + else if (act_type == math::detail::ActivationType::kSigmoid) + SigmoidGradFunctor()(d, x, y, dy, dx); + else if (act_type == math::detail::ActivationType::kTanh) + TanhGradFunctor()(d, x, y, dy, dx); + else if (act_type == math::detail::ActivationType::kReLU) + ReluGradFunctor()(d, x, y, dy, dx); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); + + auto* proj_out = ctx.Input("Projection"); + auto* cell_out = ctx.Input("Cell"); + + auto* batch_gate = ctx.Input("BatchGate"); + auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); + auto* batch_hidden = ctx.Input("BatchHidden"); + + auto* projection_g = + ctx.Input(framework::GradVarName("Projection")); + + auto* in_g = ctx.Output(framework::GradVarName("Input")); + auto* weight_g = ctx.Output(framework::GradVarName("Weight")); + auto* proj_weight_g = + ctx.Output(framework::GradVarName("ProjWeight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + + auto* h0 = ctx.Input("H0"); + auto* ordered_proj0 = ctx.Input("OrderedP0"); + auto* c0 = ctx.Input("C0"); + + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); + + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + if (weight_g) { + weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, weight_g, static_cast(0.0)); + } + if (proj_weight_g) { + proj_weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, proj_weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. 
+ // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. + Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + + framework::Vector order(batch_gate->lod()[2]); + + if (c0) { + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); + } + if (c0 && c0_g) { + ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); + } + + auto in_dims = input->dims(); + auto out_dims = cell_out->dims(); + framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, out_dims[1]); + + math::LstmMetaValue lstmp_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + lstmp_value.check_ig = bias_data + 4 * frame_size; + lstmp_value.check_fg = lstmp_value.check_ig + frame_size; + lstmp_value.check_og = lstmp_value.check_fg + frame_size; + } else { + lstmp_value.check_ig = nullptr; + lstmp_value.check_fg = nullptr; + lstmp_value.check_og = nullptr; + } + + math::LstmMetaGrad lstmp_grad; + + if (bias && bias_g) { + bias_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && ctx.Attr("use_peepholes")) { + T* bias_g_data = bias_g->data(); + lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size; + lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size; + } else { + lstmp_grad.check_ig_grad = nullptr; + lstmp_grad.check_fg_grad = nullptr; + lstmp_grad.check_og_grad = nullptr; + } + + math::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch]( + const DeviceContext& ctx, const framework::LoDTensor& src, + const framework::DDim& dims, framework::LoDTensor& dst) { + dst.mutable_data(dims, ctx.GetPlace()); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, dst, false); + }; + + LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell; + 
+ batch_hidden_g.mutable_data(out_dims, ctx.GetPlace()); + ToBatch(device_ctx, *proj_out, proj_dims, batch_proj); // T x P + ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g); // T x P + ToBatch(device_ctx, *cell_out, out_dims, batch_cell); // T x D + + LoDTensor batch_cell_g, batch_gate_g; + batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); + // TODO(qingqing) support the case output cell has gradient. + // to_batch(device_ctx, *cell_g, batch_cell_g, false); + zero(device_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + auto proj_act = math::detail::GetActivationType( + ctx.Attr("proj_activation")); + auto& place = *ctx.template device_context().eigen_device(); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor cur_proj = batch_proj.Slice(bstart, bend); + Tensor proj_g = batch_proj_g.Slice(bstart, bend); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto cur_proj_dev = EigenMatrix::From(cur_proj); + auto proj_g_dev = EigenMatrix::From(proj_g); + ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev, + proj_g_dev); + } + /* hidden state backward */ + Tensor out_g = batch_hidden_g.Slice(bstart, bend); + math::matmul(device_ctx, proj_g, false, *proj_weight, + true, static_cast(1.0), &out_g, + static_cast(0.0)); + /* projection weight backward*/ + if (proj_weight_g) { + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + math::matmul(device_ctx, hidden_t, true, proj_g, + 
+ false, static_cast(1.0), + proj_weight_g, static_cast(1.0)); + } + + Tensor gate = batch_gate->Slice(bstart, bend); + Tensor cell = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstmp_value.gate_value = gate.data(); + lstmp_value.state_value = cell.data(); + lstmp_value.state_active_value = cell_pre_act.data(); + + Tensor gate_g = batch_gate_g.Slice(bstart, bend); + Tensor cell_g = batch_cell_g.Slice(bstart, bend); + lstmp_grad.state_grad = cell_g.data(); + lstmp_grad.gate_grad = gate_g.data(); + lstmp_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstmp_value.prev_state_value = cell_pre.data(); + lstmp_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstmp_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstmp_grad.prev_state_grad = c0_g ? ordered_c0_g.data() : nullptr; + } + + int cur_batch_size = bend - bstart; + math::LstmUnitGradFunctor::compute( + device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, + gate_act, cell_act, cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_proj_g, + static_cast(1.0)); + if (weight_g) { + /* weight backward*/ + auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_proj, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + } else { + if (h0 && (weight_g || proj_weight_g)) { + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + if (weight_g) { + math::matmul(device_ctx, *ordered_proj0, true, + gate_g, false, static_cast(1.0), + weight_g, static_cast(1.0)); + } + } + if (h0 && (h0_g || proj_weight_g)) 
{ + ordered_h0_g.mutable_data(h0->dims(), ctx.GetPlace()); + Tensor proj0_g; + proj0_g.Resize({in_dims[0], proj_weight->dims()[1]}); + proj0_g.mutable_data(ctx.GetPlace()); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), &proj0_g, + static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj0_dev = EigenMatrix::From(*ordered_proj0); + auto proj0_g_dev = EigenMatrix::From(proj0_g); + ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev, + proj0_g_dev); + } + if (h0_g) { + math::matmul( + device_ctx, proj0_g, false, *proj_weight, true, + static_cast(1.0), &ordered_h0_g, static_cast(0.0)); + } + if (proj_weight_g) { + math::matmul(device_ctx, ordered_h0, true, + proj0_g, false, static_cast(1.0), + proj_weight_g, static_cast(1.0)); + } + } + } + } + + math::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + in_g->mutable_data(ctx.GetPlace()); + to_seq(device_ctx, batch_gate_g, *in_g); + } + if (bias && bias_g) { + /* backward bias */ + Tensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + math::ColwiseSum col_sum; + col_sum(device_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); + } + if (c0 && c0_g) { + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc31befb20526e84aae1804756d2d44a785aa229 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/margin_rank_loss_op.h" + +namespace paddle { +namespace operators { + +class MarginRankLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + auto label_dims = ctx->GetInputDim("Label"); + auto x1_dims = ctx->GetInputDim("X1"); + auto x2_dims = ctx->GetInputDim("X2"); + PADDLE_ENFORCE( + (label_dims == x1_dims) && (x1_dims == x2_dims) && + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensor with shape [batch_size x 1]."); + ctx->SetOutputDim("Activated", label_dims); + ctx->SetOutputDim("Out", label_dims); + } +}; + +template +class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X1", + "(2-D tensor with shape [batch_size x 1]) The score for " + "one item X1 to be ranked, from pairwise ranking model."); + AddInput("X2", + "(2-D tensor with shape [batch_size x 1]) The score for " + "another item X2 to be ranked, from pairwise ranking model."); + AddInput("Label", + "(2-D tensor with shape [batch_size x 1]) " + 
"The label indicating X1 ranked higher than X2 or not, " + "can only be +1 or -1."); + AddOutput("Activated", + "(2-D tensor with shape [batch_size x 1]) Intermediate tensor " + "to indicate whether each element of Output(Out) is activated.") + .AsIntermediate(); + AddOutput("Out", + "(2-D tensor with shape [batch_size x 1]) " + "The output loss of MarginRankLoss operator."); + AddAttr("margin", "(scalar, default 0) Margin for MarginRankLossOp.") + .SetDefault(static_cast(0)); + AddComment(R"DOC( +MarginRankLoss Operator. + +This operator measures the loss given a pair of training sample +{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` +indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss +is calculated as: + +$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$ + +The attribute `margin` here helps make the predictions more robust. +Denote the item ranked higher as the positive sample, otherwise the negative +sample. If the score of the two samples satisfies + +$positive sample - negative sample < margin$ + +the pair of samples will contribute to the final loss, which will backpropagate +and train the ranking model to enlarge the difference between the two scores. + +For batch input with size `batch_size`, `X1`, `X2` and `Label` +all have the same shape [batch_size x 1]. 
+ +)DOC"); + } +}; + +class MarginRankLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Activated"), + "Intermediate(Activated) shouldn't be null."); + auto dims = ctx->GetInputDim("Label"); + ctx->SetOutputDim(framework::GradVarName("X1"), dims); + ctx->SetOutputDim(framework::GradVarName("X2"), dims); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, + ops::MarginRankLossOpMaker, margin_rank_loss_grad, + ops::MarginRankLossGradOp); +REGISTER_OP_CPU_KERNEL( + margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CPU_KERNEL( + margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca4593a48d6d3eccff81dfd621ea1198e5bad880 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/margin_rank_loss_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL( + margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..934a5da0f804f7cf7dc176a9ee4e1b72261ef008 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +struct ReLU { + HOSTDEVICE T operator()(const T& val) const { + return val > 0 ? val : static_cast(0); + } +}; + +template +struct Heaviside { + HOSTDEVICE T operator()(const T& val) const { + return static_cast(val > 0 ? 
1 : 0); + } +}; + +template +class MarginRankLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* act_t = ctx.Output("Activated"); + + auto* label_t = ctx.Input("Label"); + auto* x1_t = ctx.Input("X1"); + auto* x2_t = ctx.Input("X2"); + + out_t->mutable_data(ctx.GetPlace()); + act_t->mutable_data(ctx.GetPlace()); + + auto margin = static_cast(ctx.Attr("margin")); + auto out = framework::EigenVector::Flatten(*out_t); + auto act = framework::EigenVector::Flatten(*act_t); + + auto label = framework::EigenVector::Flatten(*label_t); + auto x1 = framework::EigenVector::Flatten(*x1_t); + auto x2 = framework::EigenVector::Flatten(*x2_t); + + auto& dev = *ctx.template device_context().eigen_device(); + out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); + act.device(dev) = out.unaryExpr(Heaviside()); + } +}; + +template +class MarginRankLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_x1_t = + ctx.Output(framework::GradVarName("X1")); + auto* d_x2_t = + ctx.Output(framework::GradVarName("X2")); + + auto* act_t = ctx.Input("Activated"); + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); + + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto act = framework::EigenVector::Flatten(*act_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto& dev = *ctx.template device_context().eigen_device(); + + // compute d_x1 + if (d_x1_t) { + d_x1_t->mutable_data(ctx.GetPlace()); + auto d_x1 = framework::EigenVector::Flatten(*d_x1_t); + d_x1.device(dev) = -d_out * act * label; + } + // compute d_x2 + if (d_x2_t) { + d_x2_t->mutable_data(ctx.GetPlace()); + auto d_x2 = framework::EigenVector::Flatten(*d_x2_t); + d_x2.device(dev) = d_out * act * label; + } + } +}; +} // namespace operators +} // namespace paddle diff --git 
a/paddle/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt similarity index 100% rename from paddle/operators/math/CMakeLists.txt rename to paddle/fluid/operators/math/CMakeLists.txt diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc new file mode 100644 index 0000000000000000000000000000000000000000..b73d976d1b3e6dcf99e5cc525263282b1253c600 --- /dev/null +++ b/paddle/fluid/operators/math/context_project.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/context_project.h" + +namespace paddle { +namespace operators { +namespace math { + +template class ContextProjectFunctor; +template class ContextProjectFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu new file mode 100644 index 0000000000000000000000000000000000000000..bbd36a6e8f54833f15ee0c991228c10b7f74f272 --- /dev/null +++ b/paddle/fluid/operators/math/context_project.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/math/context_project.h" + +namespace paddle { +namespace operators { +namespace math { + +template class ContextProjectFunctor; +template class ContextProjectFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe593ec3af9d07a2cbafc69e8d3a52e2c43e76b --- /dev/null +++ b/paddle/fluid/operators/math/context_project.h @@ -0,0 +1,306 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +/* + * \brief Context projection concatenates features in adjacent time-steps in + * a sequence. The i-th row of the output is the concatenation of + * context_length rows of the input. The context_length rows are the + * consecutive rows from the i+shift_start row. + * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor. + * + * \param in Input data. + * \param Shape The shape of Input data: + * [mini-batch, input_hidden_size]. + * + * \param padding_data Padding data. + * \param Shape The shape of Padding data: + * [up_pad + down_pad, input_hidden_size]. + * + * \param col Col data. + * \param Shape The shape of Col data: + * [mini-batch, context_length * input_hidden_size]. + * + * For a mini-batch of 2 variable lengths sentences, containing 3, and 1 + * time-steps: + * + * Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3, + * 4]. + * Besides, for the sake of simplicity, we assume M=1 and N=2. + * + * X = [[a1, a2; + * b1, b2; + * c1, c2] + * [d1, d2]] + * + * This is to say that input (X) has 4 words and the dimension of each word + * representation is 2. 
+ * + * - Case1: + * If context_start is -1 and padding_trainable is false, we use zero to pad + * instead of learned weight to pad, + * and the context_length is 3, the output (Out) is: + * + * Out =[[0, 0, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, 0, 0 ] + * [0, 0, d1, d2, 0, 0 ]] + * + * - Case2: + * If context_start is -1 and padding_trainable is true, we use learned weight + * to pad, + * and the context_length is 3, the output (Out) is: + * + * Out = [[w1, w2, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, w3, w4] + * [w1, w2, d1, d2, w3, w4]] + * + */ + +template +class ContextProjectFunctor { + public: + void operator()(const DeviceContext& context, const LoDTensor& in, + const Tensor& padding_data, bool padding_trainable, + const int context_start, const int context_length, + const int context_stride, const int up_pad, + const int down_pad, Tensor* col) { + /* Two passes over the sequences: first im2col expands each sequence of in into its rows of col; then, only if padding_trainable, rows of padding_data are copied over the padded slots of col. */ + auto lod_level_0 = in.lod()[0]; /* level-0 LoD offsets: sequence i is sliced as rows [lod_level_0[i], lod_level_0[i + 1]) */ + + math::Im2ColFunctor im2col_ocf; + + std::vector dilation({1, 1}); + std::vector padding({up_pad, 0, down_pad, 0}); + std::vector stride({context_stride, 1}); + + int input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + input_row_begin = (context_start > 0) + ?
static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, 1, 1, context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + out_t.Resize(framework::make_ddim(output_shape)); + + std::vector input_shape( + {1, input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(framework::make_ddim(input_shape)); + im2col_ocf(context, in_t, dilation, stride, padding, &out_t); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + if (padding_trainable) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + // add up trainable data + out_t.Resize({sequence_height * context_length, sequence_width}); + + if (up_pad > 0) { // add up pad + int padding_rows = std::min( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ?
context_length : up_pad - k; + Tensor out_t_sub = out_t.Slice(k * context_length, + k * context_length + padding_size); + Tensor w_sub = padding_data.Slice(k, k + padding_size); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); + } + } + if (down_pad > 0) { // add down pad + int down_pad_begin_row = + std::max(0, + (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = std::max(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + + Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + Tensor w_sub = padding_data.Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); + } + } + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } +}; + +/* Backward counterpart of ContextProjectFunctor. If input_grad is set, col2im folds each sequence's rows of col back into the matching rows of in. If pad_grad and padding_trainable are both set, the padded slots of col are combined into padding_data through axpy (presumably padding_data += col slice; confirm against the axpy helper). */ +template +class ContextProjectGradFunctor { + public: + void operator()(const DeviceContext& context, const LoDTensor& in, + bool padding_trainable, const int context_start, + const int context_length, const int context_stride, + const int up_pad, const int down_pad, bool pad_grad, + bool input_grad, Tensor* padding_data, Tensor* col) { + auto lod_level_0 = in.lod()[0]; + + math::Col2ImFunctor col2im_ocf; + + std::vector dilation({1, 1}); + std::vector padding({up_pad, 0, down_pad, 0}); + std::vector stride({context_stride, 1}); + + int
input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + + if (input_grad) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + input_row_begin = (context_start > 0) + ? static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, 1, 1, context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + out_t.Resize(framework::make_ddim(output_shape)); + + std::vector input_shape( + {1, input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(framework::make_ddim(input_shape)); + + col2im_ocf(context, out_t, dilation, stride, padding, &in_t); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } + if (pad_grad) { + if (padding_trainable) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + out_t.Resize({sequence_height * context_length, sequence_width}); + + if (up_pad > 0) { + int padding_rows = std::min( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ?
context_length : up_pad - k; + Tensor out_t_sub = out_t.Slice(k * context_length, + k * context_length + padding_size); + Tensor w_sub = padding_data->Slice(k, k + padding_size); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); + } + } + if (down_pad > 0) { + int down_pad_begin_row = + std::max( + 0, (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = std::max(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) + padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + + Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + Tensor w_sub = padding_data->Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); + } + } + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..701a9c23c0da3afbb643e9a821b7b74e69170710 --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/cos_sim_functor.h" + +namespace paddle { +namespace operators { +namespace math { + +/* CPU gradient of cosine similarity w.r.t. y for the case where y is one row shared by every row of x: only y_norm[0] and y[0..cols) are read. Each row adds its term into dy[0..cols) with +=, so dy is accumulated into rather than overwritten. */ +template +struct CosSimDyFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + for (size_t row_id = 0; row_id < rows; ++row_id) { + auto xy_norm_prod = x_norm[row_id] * y_norm[0]; + auto dz_data = dz[row_id]; + auto z_data = z[row_id]; + auto* x_data = x + cols * row_id; /* start of row row_id in the row-major x buffer */ + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm[0] * y_norm[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + } + } + } +}; + +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu new file mode 100644 index 0000000000000000000000000000000000000000..0323680870ad835afca5a896f80d3abde0aad11c --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/cos_sim_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +/* Accumulates the cosine-similarity gradient w.r.t. the single shared y row into dy[0..cols). Rows are distributed over threads along the x dimension of the launch only (grid-stride loop over blockIdx.x / gridDim.x); the per-row terms are merged into dy with CudaAtomicAdd, so dy is added to, not overwritten. */ +template +__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) { + int grid_size = blockDim.x * gridDim.x; + T y_norm_data = y_norm[0]; + for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; + row_id += grid_size) { + T xy_norm_prod = x_norm[row_id] * y_norm_data; + T dz_data = dz[row_id]; + T z_data = z[row_id]; + const T* x_data = x + cols * row_id; + T reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + T y_norm_square = y_norm_data * y_norm_data; + T reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + platform::CudaAtomicAdd(dy + i, dy_data); + } + } +} + +/* CUDA launcher for CosSimDyKernel: 512 threads per block, enough blocks to give every row one thread. */ +template +struct CosSimDyFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + const int block_size = 512; + dim3 threads(block_size, 1); + /* The kernel derives its row index from blockIdx.x and gridDim.x only, so the blocks have to be laid out along x. A (1, N) grid would make all N blocks walk the same rows and add every contribution to dy N times. */ + dim3 grid((rows + block_size - 1) / block_size, 1); + CosSimDyKernel<<>>( + x_norm, y_norm, x, y, z, dz, rows, cols, dy); + } +}; + +template struct
CosSimDyFunctor; +template struct CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..445d94f975f3448cc09c21be8e0a13d73d002382 --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +/* Row-wise cosine similarity. Calling the functor with row_id stores the L2 norm of row row_id of x in x_norm_[row_id] and xy / (|x| * |y|) in z_[row_id]. When same_row is true y is indexed by row_id as well and y_norm_[row_id] is stored; otherwise y_ is read as one shared row and only the call with row_id == 0 stores y_norm_[0]. No guard against zero norms is applied here. */ +template +struct CosSimFunctor { + CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto* x = x_ + cols_ * row_id; + T xx = 0, xy = 0, yy = 0; + if (same_row) { + auto* y = y_ + cols_ * row_id; + T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + y_norm_[row_id] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } else { // This can be written in a better way.
+ T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y_[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + if (row_id == 0) y_norm_[0] = yy; /* y is shared, so a single writer stores its norm */ + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } + } + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +/* Gradient w.r.t. x for the case where y has one row per row of x: y_ and y_norm_ are indexed by row_id. Overwrites row row_id of dx_. */ +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; + + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +/* Gradient w.r.t. x for the case where y is a single shared row: reads y_[i] and y_norm_[0] regardless of row_id. Overwrites row row_id of dx_. */ +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; +
auto reciprocal_x_norm_square = 1 / x_norm_square; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +/* Gradient w.r.t. the shared y row. Only declared here; the CPU and CUDA definitions are in cos_sim_functor.cc and cos_sim_functor.cu. */ +template +struct CosSimDyFunctor { + void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm, + const T* x, const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc new file mode 100644 index 0000000000000000000000000000000000000000..76abd03ff8b75e595461f41301c41ffe57d78686 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/math/cross_entropy.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +/* CPU cross entropy. With softLabel, labels holds one weight per class and out = -sum(labels * log(prob)) per row; otherwise labels holds one int64 class index per row and out[i] = -log(prob[i][label]). log results pass through TolerableValue so infinities become large finite values. */ +template +class CrossEntropyFunctor { + public: + void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, + const framework::Tensor* prob, + const framework::Tensor* labels, const bool softLabel) { + const int batch_size = prob->dims()[0]; + if (softLabel) { + auto in = EigenMatrix::From(*prob); + auto lbl = EigenMatrix::From(*labels); + auto loss = EigenMatrix::From(*out); + + loss.device(*ctx.eigen_device()) = + -((lbl * in.log().unaryExpr(math::TolerableValue())) + .sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(batch_size, 1))); + } else { + const int class_num = prob->dims()[1]; + const T* prob_data = prob->data(); + T* loss_data = out->data(); + + const int64_t* label_data = labels->data(); + for (int i = 0; i < batch_size; ++i) { + /* Reject out-of-range labels (the same check the CUDA kernel performs) instead of reading outside prob. */ + PADDLE_ASSERT(label_data[i] >= 0 && label_data[i] < class_num); + /* Build the offset in 64 bits: i * class_num can exceed the int range for large inputs. */ + int64_t index = i; + index = index * class_num + label_data[i]; + loss_data[i] = -math::TolerableValue()(std::log(prob_data[index])); + } + } + } +}; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu new file mode 100644 index 0000000000000000000000000000000000000000..39222c484c2fe847aec70b65d3d01745b8eea336 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/cross_entropy.h" + +namespace paddle { +namespace operators { +namespace math { + +namespace { +template +__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, + const int N, const int D) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + PADDLE_ASSERT(label[i] >= 0 && label[i] < D); + Y[i] = -math::TolerableValue()(log(X[i * D + label[i]])); + } +} + +template +__device__ __forceinline__ T sum_single_warp(T val) { + val += __shfl_down(val, 16); + val += __shfl_down(val, 8); + val += __shfl_down(val, 4); + val += __shfl_down(val, 2); + val += __shfl_down(val, 1); + return val; +} + +// CUDA do not support dynamic arrary in template +// https://stackoverflow.com/questions/20497209 +template +struct SharedMemory { + // Ensure that we won't compile any un-specialized types + __device__ T* GetPointer() { return NULL; } +}; + +template <> +struct SharedMemory { + __device__ float* GetPointer() { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMemory { + __device__ double* GetPointer() { + extern __shared__ double s_double[]; + return s_double; + } +}; + +template +__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, + const int class_num) { + int tid = threadIdx.x; + SharedMemory d_sum_shared; + T* d_sum = d_sum_shared.GetPointer(); + d_sum[tid] = 0; + + int cur_idx = tid; + int next_idx = blockIdx.x * class_num + tid; + while (cur_idx < class_num) { + d_sum[tid] += + 
math::TolerableValue()(std::log(X[next_idx])) * label[next_idx]; + next_idx += blockDim.x; + cur_idx += blockDim.x; + } + __syncthreads(); + + for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { + if (tid < stride) d_sum[tid] += d_sum[tid + stride]; + __syncthreads(); + } + + T val = d_sum[tid]; + val = sum_single_warp(val); + if (tid == 0) Y[blockIdx.x] = -val; +} +} // namespace + +using Tensor = framework::Tensor; + +template +class CrossEntropyFunctor { + public: + void operator()(const platform::CUDADeviceContext& ctx, + framework::Tensor* out, const framework::Tensor* prob, + const framework::Tensor* labels, bool softLabel) { + const T* prob_data = prob->data(); + T* loss_data = out->mutable_data(ctx.GetPlace()); + + int batch_size = prob->dims()[0]; + int class_num = prob->dims()[1]; + + if (softLabel) { + const T* label_data = labels->data(); + int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num))); + + SoftCrossEntropyKernel<<< + batch_size, block, block * sizeof(T), + reinterpret_cast(ctx).stream()>>>( + loss_data, prob_data, label_data, class_num); + } else { + const int64_t* label_data = labels->data(); + int block = 512; + int grid = (batch_size + block - 1) / block; + CrossEntropyKernel<<>>( + loss_data, prob_data, label_data, batch_size, class_num); + } + } +}; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe216a805383ae0d7e8d008af2838652fcf87c6 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +/* Maps +infinity to 1e20 and -infinity to -1e20; every other value, including NaN (which compares unequal to both), is returned unchanged. Used by the cross-entropy code on the result of log(). */ +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); /* checked at run time, not at compile time */ + const T kApproInf = 1e20; + + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +/* Declaration only; the CPU and CUDA definitions are in cross_entropy.cc and cross_entropy.cu. Both read labels as per-class weights of type T when softLabel is true and as one int64 class index per row otherwise, and write one loss value per row of prob into out. */ +template +class CrossEntropyFunctor { + public: + void operator()(const DeviceContext& context, framework::Tensor* out, + const framework::Tensor* prob, + const framework::Tensor* labels, const bool softLabel); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu new file mode 100644 index 0000000000000000000000000000000000000000..7b75e593071eaeb72bcfc687b6ff22b7cf4f143f --- /dev/null +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +// A Cuda kernel to compute the depthwise convolution forward pass +// in NCHW format. +/* One thread per output element: index is decomposed into (batch, c_out, h_out, w_out) and threads with index >= nthreads do nothing. The filter window is clipped to the input bounds, so padded positions simply add nothing to the sum. */ +template +__global__ void KernelDepthwiseConv( + const int nthreads, const T* const input_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, T* const output_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / output_channels / output_height / output_width; + const int c_out = (index / output_height / output_width) % output_channels; + const int h_out = (index / output_width) % output_height; + const int w_out = index % output_width; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height; + const int w_in_end = w_in_start + filter_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ?
w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + value += + weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * + input_data[offset]; + } + } + output_data[index] = value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. +/* One thread per input element: index is decomposed into (batch, c_in, h_in, w_in). The result is combined with += into input_grad_data[index], so that buffer must already hold valid values (for example zeros) before the launch. */ +template +__global__ void KernelDepthwiseConvInputGrad( + const int nthreads, const T* const output_grad_data, + const T* const filter_data, const int batch_size, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const input_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / input_channels / input_height / input_width; + const int c_in = (index / input_height / input_width) % input_channels; + const int h_in = (index / input_width) % input_height; + const int w_in = index % input_width; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + (h_in - filter_height + padding_height + stride_height) / stride_height; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + + int h_out_end = (h_in + padding_height) / stride_height; + h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; + + int w_out_start = + (w_in - filter_width + padding_width + stride_width) / stride_width; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + + int w_out_end = (w_in + padding_width) / stride_width; + w_out_end = output_width - 1 < w_out_end ?
output_width - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + padding_height - h_out * stride_height; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + padding_width - w_out * stride_width; + const int filter_offset = c_out * filter_height * filter_width + + filter_h * filter_width + filter_w; + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + h_out) * + output_width + + w_out; + value += + output_grad_data[output_grad_offset] * filter_data[filter_offset]; + } + } + } + input_grad_data[index] += value; + } +} + +// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. +/* One thread per output-gradient element: index is decomposed into (batch, c_out, h_out, w_out). Several threads update the same filter tap, so contributions are merged with CudaAtomicAdd; filter_grad_data is accumulated into, not overwritten. */ +template +__global__ void KernelDepthwiseConvFilterGrad( + const int nthreads, const T* const output_grad_data, + const T* const input_data, const int num, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const filter_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int w_out = index % output_width; + const int h_out = (index / output_width) % output_height; + const int c_out = (index / output_width / output_height) % output_channels; + const int batch = (index / output_width / output_height / output_channels); + const int c_in = c_out / filter_multiplier; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = + -padding_height + h_out * stride_height + filter_height; + const int
w_in_end = -padding_width + w_out * stride_width + filter_width; + const int in_offset = + (batch * input_channels + c_in) * input_height * input_width; + + T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + const T diff_temp = output_grad_data[index] * input_data[offset]; + T* addr = addr_offset + (h_in - h_in_start) * filter_width + + (w_in - w_in_start); + paddle::platform::CudaAtomicAdd(addr, diff_temp); + } + } + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively.
+ */ +template +class DepthwiseConvFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* filter_data = filter.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConv<<>>( + nthreads, input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + output_data); + } +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int 
output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* filter_data = filter.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvInputGrad<<>>( + nthreads, output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter_grad->dims()[2]; + const int ksize_width = filter_grad->dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* 
output_grad_data = output_grad.data(); + T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvFilterGrad<<>>( + nthreads, output_grad_data, input_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + filter_grad_data); + } +}; + +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; + +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; + +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h new file mode 100644 index 0000000000000000000000000000000000000000..c3081e7a0deb4afc47d826753ecb2556aa6f4522 --- /dev/null +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Compute the depthwise convolution which include + * forward process and backpropagation process + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output); +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/fluid/operators/math/detail/CMakeLists.txt similarity index 100% rename from paddle/operators/math/detail/CMakeLists.txt rename to paddle/fluid/operators/math/detail/CMakeLists.txt diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..3af7ba790c489b2fc34b3cb6d56849ce789d2430 --- /dev/null +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -0,0 +1,191 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hostdevice.h" + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +inline ActivationType GetActivationType(const std::string &type) { + if (type == "sigmoid") { + return ActivationType::kSigmoid; + } else if (type == "relu") { + return ActivationType::kReLU; + } else if (type == "tanh") { + return ActivationType::kTanh; + } else if (type == "identity" || type == "") { + return ActivationType::kIdentity; + } + PADDLE_THROW("Not support type %s.", type); +} + +namespace forward { + +template +DEVICE T Identity(const T a) { + return a; +} + +template +DEVICE T Relu(const T a) { + return a > static_cast(0.0) ? a : static_cast(0.0); +} + +template +DEVICE T Sigmoid(const T a) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + T tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); +} + +template +DEVICE T Tanh(const T a) { + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +} // namespace forward + +namespace backward { + +template +DEVICE T Identity(const T a, const T b) { + return a; +} + +template +DEVICE T Relu(const T a, const T b) { + return a * (b > 0.0 ? 1.0 : 0.0); +} + +template +DEVICE T Sigmoid(const T a, const T b) { + return a * b * (1.0 - b); +} + +template +DEVICE T Tanh(const T a, const T b) { + return a * (1.0 - b * b); +} + +} // namespace backward + +template +struct Active { + typedef T (*Act)(T); + typedef T (*ActGrad)(T, T); +}; + +static DEVICE Active::Act kActFloat[] = { + &forward::Sigmoid, &forward::Relu, &forward::Tanh, + &forward::Identity}; + +static DEVICE Active::ActGrad kActGradFloat[] = { + &backward::Sigmoid, &backward::Relu, &backward::Tanh, + &backward::Identity}; + +static DEVICE Active::Act kActDouble[] = { + &forward::Sigmoid, &forward::Relu, &forward::Tanh, + &forward::Identity}; + +static DEVICE Active::ActGrad kActGradDouble[] = { + &backward::Sigmoid, &backward::Relu, + &backward::Tanh, &backward::Identity}; + +namespace forward { +inline DEVICE float activation(float a, int index) { + return kActFloat[index](a); +} + +inline DEVICE double activation(double a, int index) { + return kActDouble[index](a); +} + +} // namespace forward + +namespace backward { +inline DEVICE float activation(float a, float b, int index) { + return kActGradFloat[index](a, b); +} + +inline DEVICE double activation(double a, double b, int index) { + return kActGradDouble[index](a, b); +} +} // namespace backward + +#ifdef __AVX__ +namespace forward { +namespace avx { +__m256 Relu(const __m256 a); +__m256 Sigmoid(const __m256 a); +__m256 Tanh(const __m256 a); +__m256 Identity(const __m256 a); +} // namespace avx +} // namespace forward + +namespace backward { +namespace avx { +__m256 Relu(const __m256 a, const __m256 b); +__m256 Sigmoid(const __m256 a, const __m256 b); +__m256 Tanh(const __m256 a, const __m256 b); +__m256 Identity(const __m256 a, 
const __m256 b); +} // namespace avx +} // namespace backward + +static Active<__m256>::Act kActAvx[] = { + &forward::avx::Sigmoid, &forward::avx::Relu, &forward::avx::Tanh, + &forward::avx::Identity}; + +static Active<__m256>::ActGrad kActGradAvx[] = { + &backward::avx::Sigmoid, &backward::avx::Relu, &backward::avx::Tanh, + &backward::avx::Identity}; + +namespace forward { +inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } +} // namespace forward + +namespace backward { +inline __m256 activation(__m256 a, __m256 b, int index) { + return kActGradAvx[index](a, b); +} +} // namespace backward + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc new file mode 100644 index 0000000000000000000000000000000000000000..838cd30e3d503ddb4734f1114e741fd40b1939c0 --- /dev/null +++ b/paddle/fluid/operators/math/detail/avx_functions.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef __AVX__ + +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +// TODO(qingqing) refine this dependence +#include "paddle/cuda/src/avx_mathfun.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +__m256 Exp(__m256 a) { return exp256_ps(a); } + +namespace forward { +namespace avx { +__m256 Relu(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} + +__m256 Sigmoid(const __m256 a) { + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 tmp = _mm256_max_ps(a, min); + tmp = _mm256_min_ps(tmp, max); + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = Exp(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} + +__m256 Tanh(const __m256 a) { + __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = _mm256_min_ps(tmp, max); + tmp = Exp(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} + +__m256 Identity(const __m256 a) { return a; } + +} // namespace avx +} // namespace forward + +namespace backward { +namespace avx { +__m256 Relu(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), + _mm256_set1_ps(1.0f))); +} + +__m256 Sigmoid(const __m256 a, const __m256 b) { + return _mm256_mul_ps(_mm256_mul_ps(a, b), + _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); +} + +__m256 Tanh(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); +} + +__m256 Identity(const __m256 a, const __m256 b) { return a; } +} // namespace avx +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle + +#endif diff --git 
a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..75c5c8eb29a34047a22779edcc0fc2f5fbcbab6f --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h @@ -0,0 +1,426 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#ifndef __NVCC__ + +template +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { + T r_value_update_gate; + T r_value_reset_gate; + T r_value_reset_output; + T r_prev_out = 0; + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + reset_output_value[i] = r_value_reset_output; + } +} + +template +void 
hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { + T r_value_update_gate; + T r_value_frame_state; + T r_prev_out = 0; + T r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + frame_state[i] = r_value_frame_state; + output_value[i] = r_output; + } +} + +template +void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { +#ifdef __AVX__ + __m256 r_value_update_gate; + __m256 r_value_reset_gate; + __m256 r_value_reset_output; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 *update_gate = (__m256 *)gate_value; + __m256 *reset_gate = (__m256 *)(gate_value + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + ((__m256 *)reset_output_value)[i] = r_value_reset_output; + } +#endif +} + +template +void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { +#ifdef __AVX__ + __m256 r_value_update_gate; + __m256 r_value_frame_state; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 r_output; + __m256 *update_gate = (__m256 *)gate_value; + 
__m256 *frame_state = (__m256 *)(gate_value + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + frame_state[i] = r_value_frame_state; + ((__m256 *)output_value)[i] = r_output; + } +#endif +} + +template +inline void forward_reset_output(OpResetOutput op_reset_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); + } else { + hl_naive_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); + } + + value.gate_value += frame_size * 3; + value.reset_output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +inline void forward_final_output(OpFinalOutput op_final_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_final_output(op_final_output, value.gate_value, + value.prev_out_value, value.output_value, + frame_size, active_node); + } else { + hl_naive_gru_forward_final_output( + op_final_output, value.gate_value, value.prev_out_value, + value.output_value, frame_size, active_node); + } + + value.gate_value += frame_size * 3; + value.output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +void 
hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, + ActivationType active_node) { + T r_update_gate_value; + T r_update_gate_grad; + T r_frame_state_value; + T r_frame_state_grad; + T r_out_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *frame_state_value = gate_value + frame_size * 2; + T *frame_state_grad = gate_grad + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = output_grad[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; + } + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; + } + } +} + +template +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, + ActivationType active_gate) { + T r_update_gate_value; + T r_update_gate_grad; + T r_reset_gate_value; + T r_reset_gate_grad; + T r_reset_output_grad = 0; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *reset_gate_value = gate_value + frame_size; + T *reset_gate_grad = gate_grad + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = reset_output_grad[i]; + } 
+ if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; + } + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; + } + } +} + +template +void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, + ActivationType active_node) { +#ifdef __AVX__ + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_frame_state_value; + __m256 r_frame_state_grad; + __m256 r_out_grad; + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2); + __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = ((__m256 *)output_grad)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; + } + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; + } + } +#endif +} + +template +void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T 
*prev_out_grad, T *reset_output_grad, + int frame_size, + ActivationType active_gate) { +#ifdef __AVX__ + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_reset_gate_value; + __m256 r_reset_gate_grad; + __m256 r_reset_output_grad = _mm256_set1_ps(0.0f); + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size); + __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = ((__m256 *)reset_output_grad)[i]; + } + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; + } + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; + } + } +#endif +} + +template +inline void backward_state_grad(OpStateGrad op_state_grad, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_backward_state_grad( + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); + } else { + hl_naive_gru_backward_state_grad( + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + 
grad.prev_out_grad, grad.output_grad, frame_size, active_node); + } + + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + + grad.gate_grad += frame_size * 3; + grad.output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; + } + } +} + +template +inline void backward_reset_grad(OpResetGrad op_reset_grad, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_backward_reset_grad( + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + } else { + hl_naive_gru_backward_reset_grad( + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + } + + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + + grad.gate_grad += frame_size * 3; + grad.reset_output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; + } + } +} + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fbf69d4a85883a68f137945ed8978acb9108b77b --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + int batch_size, + ActivationType active_gate) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + + T r_prev_out = 0; + T r_value_reset_output; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_reset_gate = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + gate_value[frame_idx + frame_size * 0] = r_value_update_gate; + gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; + reset_output_value[frame_idx] = r_value_reset_output; +} + +/* + * 
threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + int batch_size, + ActivationType active_node) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + T r_output; + T r_prev_out = 0; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_frame_state = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + gate_value[frame_idx + frame_size * 2] = r_value_frame_state; + output_value[frame_idx] = r_output; +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, int batch_size, + ActivationType active_node) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + output_grad += batch_idx * frame_size; + } + + T r_update_gate_grad; + T r_frame_state_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T 
r_frame_state_value = gate_value[frame_idx + frame_size * 2]; + T r_out_grad = output_grad[frame_idx]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_grad = prev_out_grad[frame_idx]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; + } +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, int batch_size, + ActivationType active_gate) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + reset_output_grad += batch_idx * frame_size; + } + + T r_reset_gate_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_reset_output_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0]; + T r_reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + r_prev_out_grad = prev_out_grad[frame_idx]; + 
r_reset_output_grad = reset_output_grad[frame_idx]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; + } +} +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..705787e2ff76630fbcb23e646c91f74fa2feea24 --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_kernel.h @@ -0,0 +1,163 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include + +// TODO(guosheng): refine code style in gru_kernel +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +namespace forward { + +template +class gru_resetOutput { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, + T &prev_out, T &value_reset_output, + ActivationType act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = prev_out * value_reset_gate; + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_reset_gate, __m256 &prev_out, + __m256 &value_reset_output, + ActivationType act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); + } +#endif +#endif +}; + +template +class gru_finalOutput { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, + T &prev_out, T &value_output, + ActivationType act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = prev_out - (value_update_gate * prev_out) + + (value_update_gate * value_frame_state); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_frame_state, __m256 &prev_out, + __m256 &value_output, ActivationType act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = _mm256_add_ps( + _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), + _mm256_mul_ps(value_update_gate, value_frame_state)); + } +#endif +#endif 
+}; +} // namespace forward + +namespace backward { + +template +class gru_stateGrad { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_frame_state, T &grad_frame_state, + T &value_prev_out, T &grad_prev_out, + T &grad_output, ActivationType act_input) { + grad_update_gate = (grad_output * value_frame_state); + grad_update_gate -= (grad_output * value_prev_out); + grad_prev_out -= (grad_output * value_update_gate); + grad_prev_out += grad_output; + grad_frame_state = activation(grad_output * value_update_gate, + value_frame_state, act_input); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, + __m256 &value_frame_state, + __m256 &grad_frame_state, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_output, + ActivationType act_input) { + grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); + grad_update_gate = _mm256_sub_ps( + grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); + grad_prev_out = _mm256_add_ps( + _mm256_sub_ps(grad_prev_out, + _mm256_mul_ps(grad_output, value_update_gate)), + grad_output); + grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate), + value_frame_state, act_input); + } +#endif +#endif +}; + +template +class gru_resetGrad { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_reset_gate, T &grad_reset_gate, + T &value_prev_out, T &grad_prev_out, + T &grad_reset_output, ActivationType act_gate) { + grad_reset_gate = (grad_reset_output * value_prev_out); + grad_prev_out += (grad_reset_output * value_reset_gate); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const 
bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, __m256 &value_reset_gate, + __m256 &grad_reset_gate, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_reset_output, + ActivationType act_gate) { + grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); + grad_prev_out = _mm256_add_ps( + grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); + } +#endif +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bf26509ba17774f55c4f7592ff3afb7bcfcaa336 --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -0,0 +1,312 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#ifndef __NVCC__ + +template +void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_checkI; + T r_checkF; + T r_checkO; + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? 
value.check_og[i] : 0; + + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + value.state_value[i] = r_state; + value.state_active_value[i] = r_state_atv; + value.output_value[i] = r_out; + } +} + +template +void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI; + T r_checkF; + T r_checkO; + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + T *grad_in = grad.gate_grad; + T *grad_ig = grad.gate_grad + frame_size; + T *grad_fg = grad.gate_grad + frame_size * 2; + T *grad_og = grad.gate_grad + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? 
value.check_og[i] : 0; + r_state = value.state_value[i]; + r_state_atv = value.state_active_value[i]; + r_output_grad = grad.output_grad[i]; + r_state_grad = grad.state_grad[i]; + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + grad.state_grad[i] = r_state_grad; + + if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad; + } + if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad; + } +} + +template +void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { +#ifdef __AVX__ + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_state; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_state_atv; + __m256 r_out; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 
*)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; + } + + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + ((__m256 *)value.state_value)[i] = r_state; + ((__m256 *)value.state_active_value)[i] = r_state_atv; + ((__m256 *)value.output_value)[i] = r_out; + } +#endif +} + +template +void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { +#ifdef __AVX__ + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_grad_in; + __m256 r_grad_ig; + __m256 r_grad_fg; + __m256 r_grad_og; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_prev_state_grad; + __m256 r_state_grad; + __m256 r_state; + __m256 r_state_atv; + __m256 r_output_grad; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_checkIGrad; + __m256 r_checkFGrad; + __m256 r_checkOGrad; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + __m256 *grad_in = (__m256 *)grad.gate_grad; + __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size); + __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2); + __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + 
r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; + } + r_state = ((__m256 *)value.state_value)[i]; + r_state_atv = ((__m256 *)value.state_active_value)[i]; + r_output_grad = ((__m256 *)grad.output_grad)[i]; + r_state_grad = ((__m256 *)grad.state_grad)[i]; + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + ((__m256 *)grad.state_grad)[i] = r_state_grad; + + if (grad.prev_state_grad) + ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad; + if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad; + } + if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad; + } +#endif +} + +template +void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_forward_one_sequence(op, value, frame_size, active_node, + active_gate, active_state); + } else { + naive_lstm_forward_one_sequence(op, value, frame_size, active_node, + active_gate, active_state); + } +} + +template +void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + if (Op::avx 
&& !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_backward_one_sequence(op, value, grad, frame_size, active_node, + active_gate, active_state); + } else { + naive_lstm_backward_one_sequence(op, value, grad, frame_size, + active_node, active_gate, active_state); + } +} + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7865d0c0ba12c6150d87ea22e9c597e90b57e1ba --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device_context.h" + +#include + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.output_value += batch_idx * frame_size; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; + } + + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? 
value.check_og[frame_idx] : 0; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate, + active_state); + + value.gate_value[frame_idx] = r_value_in; + value.gate_value[frame_idx + frame_size] = r_value_ig; + value.gate_value[frame_idx + frame_size * 2] = r_value_fg; + value.gate_value[frame_idx + frame_size * 3] = r_value_og; + + value.state_value[frame_idx] = r_state; + value.state_active_value[frame_idx] = r_state_atv; + value.output_value[frame_idx] = r_out; +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeLstmBackward(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; + grad.gate_grad += batch_idx * frame_size * 4; + grad.state_grad += batch_idx * frame_size; + grad.output_grad += batch_idx * frame_size; + } + + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T 
r_state_atv; + T r_output_grad; + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? value.check_og[frame_idx] : 0; + + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + r_state = value.state_value[frame_idx]; + r_state_atv = value.state_active_value[frame_idx]; + r_output_grad = grad.output_grad[frame_idx]; + r_state_grad = grad.state_grad[frame_idx]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad.gate_grad[frame_idx] = r_grad_in; + grad.gate_grad[frame_idx + frame_size] = r_grad_ig; + grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg; + grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og; + grad.state_grad[frame_idx] = r_state_grad; + if (grad.prev_state_grad) { + if (is_batch) grad.prev_state_grad += batch_idx * frame_size; + grad.prev_state_grad[frame_idx] = r_prev_state_grad; + } + + if (is_batch) { + if (value.prev_state_value) { + if (grad.check_ig_grad) + paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx, + r_checkIGrad); + if (grad.check_fg_grad) + paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx, + r_checkFGrad); + } + if (grad.check_og_grad) + paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx, + r_checkOGrad); + } else { + if (value.prev_state_value) { + if 
(grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad; + } + if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad; + } +} + +template +void gpu_lstm_forward(const platform::DeviceContext& context, Op op, + LstmMetaValue value, int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + /* frame_per_block = 32 batch_per_block = 32 */ + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + auto stream = + reinterpret_cast(context).stream(); + if (batch_size == 1) { + KeLstmForward<<>>( + op, value, frame_size, batch_size, active_node, active_gate, + active_state); + } else { + KeLstmForward<<>>( + op, value, frame_size, batch_size, active_node, active_gate, + active_state); + } +} + +template +void gpu_lstm_backward(const platform::DeviceContext& context, Op op, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + /* frame_per_block = 32 batch_per_block = 16 */ + threads = dim3(32, 16); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); + } + + auto stream = + reinterpret_cast(context).stream(); + if (batch_size == 1) { + KeLstmBackward<<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, + active_state); + } else { + KeLstmBackward<<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, + active_state); + } +} + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0679cc62ba91fce540d9f6e227729a96a1553173 --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_kernel.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +namespace forward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &prev_state, T &state, T &state_atv, T &output, + T &checkI, T &checkF, T &checkO, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + value_in = activation(value_in, active_node); + value_ig = activation(value_ig + prev_state * checkI, active_gate); + value_fg = activation(value_fg + prev_state * checkF, active_gate); + state = value_in * value_ig + prev_state * value_fg; + value_og = activation(value_og + state * checkO, active_gate); + state_atv = activation(state, active_state); + output = value_og * state_atv; + } +#ifndef __NVCC__ +#ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default + static const bool avx = false; +#else + // Only float support AVX optimization + static const bool avx = std::is_same::value; + + HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig, + __m256 &value_fg, __m256 &value_og, + __m256 &prev_state, __m256 &state, + __m256 &state_atv, __m256 &output, __m256 &checkI, + __m256 &checkF, __m256 &checkO, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + value_in = activation(value_in, active_node); + value_ig = + activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)), + active_gate); + value_fg = + activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)), + active_gate); + state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig), + _mm256_mul_ps(prev_state, value_fg)); + value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)), + active_gate); + state_atv = activation(state, active_state); + output = _mm256_mul_ps(value_og, state_atv); + } +#endif +#endif +}; + +} // namespace forward + +namespace backward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &grad_in, T &grad_ig, T &grad_fg, T &grad_og, + T &prev_state, T &prev_state_grad, T &state, + T &state_grad, T &state_atv, T &output_grad, + T &checkI, T &checkF, T &checkO, T &checkIGrad, + T &checkFGrad, T &checkOGrad, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + grad_og = activation(output_grad * state_atv, value_og, active_gate); + state_grad += activation(output_grad * value_og, state_atv, active_state) + + grad_og * checkO; + grad_in = activation(state_grad * value_ig, value_in, active_node); + grad_ig = activation(state_grad * value_in, value_ig, active_gate); + grad_fg = activation(state_grad * prev_state, value_fg, active_gate); + prev_state_grad = + grad_ig * checkI + grad_fg * checkF + state_grad * value_fg; + 
checkIGrad = grad_ig * prev_state; + checkFGrad = grad_fg * prev_state; + checkOGrad = grad_og * state; + } +#ifndef __NVCC__ +#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default + static const bool avx = false; +#else + // Only float support AVX optimization + static const bool avx = std::is_same::value; + HOSTDEVICE void operator()( + __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og, + __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og, + __m256 &prev_state, __m256 &prev_state_grad, __m256 &state, + __m256 &state_grad, __m256 &state_atv, __m256 &output_grad, + __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, + __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { + grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og, + active_gate); + state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og), + state_atv, active_state), + state_grad); + state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad); + grad_in = + activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node); + grad_ig = + activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate); + grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg, + active_gate); + prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI), + _mm256_mul_ps(grad_fg, checkF)); + prev_state_grad = + _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad); + checkIGrad = _mm256_mul_ps(grad_ig, prev_state); + checkFGrad = _mm256_mul_ps(grad_fg, prev_state); + checkOGrad = _mm256_mul_ps(grad_og, state); + } +#endif +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detection_util.h b/paddle/fluid/operators/math/detection_util.h new file mode 100644 index 
0000000000000000000000000000000000000000..13e5d406c11a10dc87533a2ca07d14f4684446f5 --- /dev/null +++ b/paddle/fluid/operators/math/detection_util.h @@ -0,0 +1,300 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct BBox { + BBox(T x_min, T y_min, T x_max, T y_max) + : x_min(x_min), + y_min(y_min), + x_max(x_max), + y_max(y_max), + is_difficult(false) {} + + BBox() {} + + T get_width() const { return x_max - x_min; } + + T get_height() const { return y_max - y_min; } + + T get_center_x() const { return (x_min + x_max) / 2; } + + T get_center_y() const { return (y_min + y_max) / 2; } + + T get_area() const { return get_width() * get_height(); } + + // coordinate of bounding box + T x_min; + T y_min; + T x_max; + T y_max; + // whether difficult object (e.g. 
object with heavy occlusion is difficult) + bool is_difficult; +}; +// KNCHW ==> NHWC +// template +template +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec); +template +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec); +template +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data); +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2); +template +bool SortScorePairDescend(const std::pair>& pair1, + const std::pair>& pair2); +template +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2); + +template +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices); +template +int GetDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices); +template +BBox ClipBBox(const BBox& bbox); +template +void GetDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data); +template +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec) { + size_t out_offset = bbox_vec.size(); + bbox_vec.resize(bbox_vec.size() + num_bboxes); + for (size_t i = 0; i < num_bboxes; ++i) { + BBox bbox; + bbox.x_min = *(prior_data + i * 8); + bbox.y_min = *(prior_data + i * 8 + 1); + bbox.x_max = *(prior_data + i * 8 + 2); + bbox.y_max = *(prior_data + i * 8 + 3); + bbox_vec[out_offset + i] = 
bbox; + } +} +template +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec) { + size_t out_offset = var_vec.size(); + var_vec.resize(var_vec.size() + num); + for (size_t i = 0; i < num; ++i) { + std::vector var; + var.push_back(*(prior_data + i * 8 + 4)); + var.push_back(*(prior_data + i * 8 + 5)); + var.push_back(*(prior_data + i * 8 + 6)); + var.push_back(*(prior_data + i * 8 + 7)); + var_vec[out_offset + i] = var; + } +} +template +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data) { + T prior_bbox_width = prior_bbox.get_width(); + T prior_bbox_height = prior_bbox.get_height(); + T prior_bbox_center_x = prior_bbox.get_center_x(); + T prior_bbox_center_y = prior_bbox.get_center_y(); + + T decoded_bbox_center_x = + prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width + + prior_bbox_center_x; + T decoded_bbox_center_y = + prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height + + prior_bbox_center_y; + T decoded_bbox_width = + std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width; + T decoded_bbox_height = + std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height; + + BBox decoded_bbox; + decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2; + decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2; + decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2; + decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2; + + return decoded_bbox; +} +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} +template +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2) { + if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || + bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { + return 0.0; + } else { + T inter_x_min = std::max(bbox1.x_min, bbox2.x_min); + T inter_y_min = std::max(bbox1.y_min, 
bbox2.y_min); + T interX_max = std::min(bbox1.x_max, bbox2.x_max); + T interY_max = std::min(bbox1.y_max, bbox2.y_max); + + T inter_width = interX_max - inter_x_min; + T inter_height = interY_max - inter_y_min; + T inter_area = inter_width * inter_height; + + T bbox_area1 = bbox1.get_area(); + T bbox_area2 = bbox2.get_area(); + + return inter_area / (bbox_area1 + bbox_area2 - inter_area); + } +} + +template +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices) { + std::vector> scores; + for (size_t i = 0; i < num_priors; ++i) { + size_t conf_offset = i * num_classes + class_idx; + if (conf_score_data[conf_offset] > conf_threshold) + scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); + } + std::stable_sort(scores.begin(), scores.end(), + SortScorePairDescend); + if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); + while (scores.size() > 0) { + const size_t idx = scores.front().second; + bool keep = true; + for (size_t i = 0; i < indices->size(); ++i) { + if (keep) { + const size_t saved_idx = (*indices)[i]; + T overlap = jaccard_overlap(bboxes[idx], bboxes[saved_idx]); + keep = overlap <= nms_threshold; + } else { + break; + } + } + if (keep) indices->push_back(idx); + scores.erase(scores.begin()); + } +} +template +int GetDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices) { + int total_keep_num = 0; + for (size_t n = 0; n < batch_size; ++n) { + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + size_t num_detected = 0; + std::map> indices; + size_t conf_offset = n * num_priors * num_classes; + for 
(size_t c = 0; c < num_classes; ++c) { + if (c == background_label_id) continue; + ApplyNmsFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, + conf_threshold, nms_threshold, num_priors, num_classes, + &(indices[c])); + num_detected += indices[c].size(); + } + if (top_k > 0 && num_detected > top_k) { + // std::vector> score_index_pairs; + std::vector>> score_index_pairs; + for (size_t c = 0; c < num_classes; ++c) { + const std::vector& label_indices = indices[c]; + for (size_t i = 0; i < label_indices.size(); ++i) { + size_t idx = label_indices[i]; + score_index_pairs.push_back( + std::make_pair((conf_data + conf_offset)[idx * num_classes + c], + std::make_pair(c, idx))); + } + } + std::sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(top_k); + std::map> new_indices; + for (size_t i = 0; i < score_index_pairs.size(); ++i) { + size_t label = score_index_pairs[i].second.first; + size_t idx = score_index_pairs[i].second.second; + new_indices[label].push_back(idx); + } + all_detection_indices->push_back(new_indices); + total_keep_num += top_k; + } else { + all_detection_indices->push_back(indices); + total_keep_num += num_detected; + } + } + return total_keep_num; +} +template +BBox ClipBBox(const BBox& bbox) { + T one = static_cast(1.0); + T zero = static_cast(0.0); + BBox clipped_bbox; + clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero); + clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero); + clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero); + clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero); + return clipped_bbox; +} +template +void GetDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data) { + size_t count = 0; + for (size_t n = 0; n < batch_size; ++n) { + for 
(std::map>::const_iterator it = + all_indices[n].begin(); + it != all_indices[n].end(); ++it) { + size_t label = it->first; + const std::vector& indices = it->second; + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + size_t conf_offset = n * num_priors * num_classes + idx * num_classes; + out_data[count * 7] = n; + out_data[count * 7 + 1] = label; + out_data[count * 7 + 2] = (conf_data + conf_offset)[label]; + BBox clipped_bbox = ClipBBox(decoded_bboxes[idx]); + out_data[count * 7 + 3] = clipped_bbox.x_min; + out_data[count * 7 + 4] = clipped_bbox.y_min; + out_data[count * 7 + 5] = clipped_bbox.x_max; + out_data[count * 7 + 6] = clipped_bbox.y_max; + ++count; + } + } + } +} +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..100318041679e38b37fd1ef1f071d4e682889756 --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUUnitFunctor { + static void compute(const platform::CPUDeviceContext &context, + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { +#ifndef __NVCC__ + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); + } + + detail::forward_reset_output(detail::forward::gru_resetOutput(), value, + frame_size, batch_size, active_gate); + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); + } + + detail::forward_final_output(detail::forward::gru_finalOutput(), value, + frame_size, batch_size, active_node); +#endif + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const platform::CPUDeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { +#ifndef __NVCC__ + detail::backward_state_grad(detail::backward::gru_stateGrad(), value, + grad, frame_size, batch_size, active_node); + + if (value.prev_out_value && grad.prev_out_grad) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); + + if (grad.state_weight_grad) { + math::gemm( + 
context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); + } + } + + detail::backward_reset_grad(detail::backward::gru_resetGrad(), value, + grad, frame_size, batch_size, active_gate); + + if (grad.prev_out_grad && value.prev_out_value) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); + + if (grad.gate_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); + } + } +#endif + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d5d5d7a743150c397a5b8356d1b3add88c509b6 --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUUnitFunctor { + static void compute(const platform::CUDADeviceContext &context, + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardResetOutput, + /* is_batch= */ false, + T><<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); + } else { + detail::KeGruForwardResetOutput, + /* is_batch= */ true, + T><<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); + } + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardFinalOutput, + /* is_batch= */ false, + T><<>>( + 
detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, + active_node); + } else { + detail::KeGruForwardFinalOutput, + /* is_batch= */ true, + T><<>>( + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, + active_node); + } + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const platform::CUDADeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (batch_size == 1) { + detail::KeGruBackwardStateGrad< + detail::backward::gru_stateGrad, + /* is_batch= */ false><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); + } else { + detail::KeGruBackwardStateGrad< + detail::backward::gru_stateGrad, + /* is_batch= */ true><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); + } + + if (value.prev_out_value && grad.prev_out_grad) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); + + if (grad.state_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size, batch_size, 1, + 
value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); + } + } + + if (batch_size == 1) { + detail::KeGruBackwardResetGrad< + detail::backward::gru_resetGrad, + /* is_batch= */ false><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); + } else { + detail::KeGruBackwardResetGrad< + detail::backward::gru_resetGrad, + /* is_batch= */ true><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); + } + + if (grad.prev_out_grad && value.prev_out_value) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); + + if (grad.gate_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); + } + } + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..93e19cf55782facfd95affdc77dcf78c511d8bbd --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUMetaValue { + T *gate_weight; + T *state_weight; + T *gate_value; + T *reset_output_value; + T *output_value; + T *prev_out_value; +}; + +template +struct GRUMetaGrad { + T *gate_weight_grad; + T *state_weight_grad; + T *gate_grad; + T *reset_output_grad; + T *output_grad; + T *prev_out_grad; +}; + +template +struct GRUUnitFunctor { + static void compute(const DeviceContext &context, GRUMetaValue value, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate); +}; + +template +struct GRUUnitGradFunctor { + static void compute(const DeviceContext &context, GRUMetaValue value, + GRUMetaGrad grad, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc new file mode 100644 index 0000000000000000000000000000000000000000..c298b00bb4cb9df8f9a54b4420edb07aed9cf891 --- /dev/null +++ b/paddle/fluid/operators/math/im2col.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/im2col.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int col_height = col->dims()[3]; + int col_width = col->dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + ((dilation[0] * (filter_height - 1) + 1))) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + ((dilation[1] * (filter_width - 1) + 1))) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int 
w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * col_height + h) * col_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? static_cast(0) + : im_data[im_idx]; + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int col_height = col.dims()[3]; + int col_width = col.dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + ((dilation[0] * (filter_height - 1) + 1))) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + ((dilation[1] * (filter_width - 1) + 1))) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int channels_col = im_channels * filter_height * filter_width; + + T* im_data = im->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + 
int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + if ((im_row_idx) >= 0 && (im_row_idx) < im_height && + (im_col_idx) >= 0 && (im_col_idx) < im_width) { + im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += + col_data[(c * col_height + h) * col_width + w]; + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[3]; + int filter_width = col->dims()[4]; + int col_height = col->dims()[0]; + int col_width = col->dims()[1]; + + PADDLE_ENFORCE_EQ( + (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ( + (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + const T* im_data = im.data(); + T* col_data = col->data(); + + for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { + for (int col_col_idx = 
0; col_col_idx < col_width; ++col_col_idx) { + for (int channel = 0; channel < im_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < filter_height; + ++filter_row_idx) { + int im_row_offset = + col_row_idx * stride[0] + filter_row_idx - padding[0]; + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_col_offset = + col_col_idx * stride[1] + filter_col_idx - padding[1]; + + int col_offset = + ((((col_row_idx)*col_width + col_col_idx) * im_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + + int im_offset = (channel * im_height + im_row_offset) * im_width + + im_col_offset; + col_data[col_offset] = + (im_row_offset < 0 || im_row_offset >= im_height || + im_col_offset < 0 || im_col_offset >= im_width) + ? static_cast(0) + : im_data[im_offset]; + } + } + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int col_height = col.dims()[0]; + int col_width = col.dims()[1]; + + PADDLE_ENFORCE_EQ( + (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ( + (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + T* 
im_data = im->data(); + const T* col_data = col.data(); + + for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { + for (int channel = 0; channel < im_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < filter_height; + ++filter_row_idx) { + int im_row_offset = + col_row_idx * stride[0] + filter_row_idx - padding[0]; + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_col_offset = + col_col_idx * stride[1] + filter_col_idx - padding[1]; + + int col_offset = + (((col_row_idx * col_width + col_col_idx) * im_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + + if (im_row_offset >= 0 && im_row_offset < im_height && + im_col_offset >= 0 && im_col_offset < im_width) { + int im_offset = + (channel * im_height + im_row_offset) * im_width + + im_col_offset; + im_data[im_offset] += col_data[col_offset]; + } + } + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu new file mode 100644 index 0000000000000000000000000000000000000000..c26343aacf524c4381a8bba1e4e0d1a07bee6d6e --- /dev/null +++ b/paddle/fluid/operators/math/im2col.cu @@ -0,0 +1,424 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void im2col(const T* data_im, int num_outs, int im_height, + int im_width, int dilation_h, int dilation_w, + int filter_height, int filter_width, int stride_height, + int stride_width, int padding_height, int padding_width, + int col_height, int col_width, T* data_col) { + const int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < num_outs) { + int w_out = index % col_width; + int h_out = (index / col_width) % col_height; + int channel_in = index / col_width / col_height; + int channel_out = channel_in * filter_height * filter_width; + int h_in = h_out * stride_height - padding_height; + int w_in = w_out * stride_width - padding_width; + + data_col += (channel_out * col_height + h_out) * col_width + w_out; + data_im += (channel_in * im_height + h_in) * im_width + w_in; + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + int rIdx = h_in + i * dilation_h; + int cIdx = w_in + j * dilation_w; + *data_col = + (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0) + ? 
0 + : data_im[i * dilation_h * im_width + j * dilation_w]; + data_col += col_height * col_width; + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int col_height = col->dims()[3]; + int col_width = col->dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int num_outputs = im_channels * col_height * col_width; + int blocks = (num_outputs + 1024 - 1) / 1024; + int block_x = 512; + int block_y = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(block_x, block_y); + im2col<<>>( + im.data(), num_outputs, im_height, im_width, dilation[0], + dilation[1], filter_height, filter_width, stride[0], stride[1], + padding[0], padding[1], col_height, col_width, col->data()); + } +}; + +template +__global__ void col2im(int n, const T* data_col, int im_height, int im_width, + int dilation_h, int dilation_w, int filter_height, + int filter_width, int stride_height, int stride_width, + int padding_height, int padding_width, int col_height, + int col_width, T* 
data_im) { + const int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + const int d_filter_height = dilation_h * (filter_height - 1) + 1; + const int d_filter_width = dilation_w * (filter_width - 1) + 1; + + if (index < n) { + T val = 0; + int w = index % im_width + padding_width; + int h = (index / im_width) % im_height + padding_height; + int c = index / (im_width * im_height); + + // compute the start and end of the output + int w_col_start = + (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; + int w_col_end = min(w / stride_width + 1, col_width); + int h_col_start = + (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1; + int h_col_end = min(h / stride_height + 1, col_height); + + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + int h_off = (h - h_col * stride_height); + int w_off = (w - w_col * stride_width); + if (h_off % dilation_h == 0 && w_off % dilation_w == 0) { + h_off /= dilation_h; + w_off /= dilation_w; + int data_col_index = + (((c * filter_height + h_off) * filter_width + w_off) * + col_height + + h_col) * + col_width + + w_col; + + val += data_col[data_col_index]; + } + } + } + data_im[index] = val; + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int col_height = col.dims()[3]; + int col_width 
= col.dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + size_t num_kernels = im_channels * im_height * im_width; + + size_t blocks = (num_kernels + 1024 - 1) / 1024; + size_t block_x = 512; + size_t block_y = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(block_x, block_y); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + col2im<<>>( + num_kernels, col.data(), im_height, im_width, dilation[0], + dilation[1], filter_height, filter_width, stride[0], stride[1], + padding[0], padding[2], col_height, col_width, im->data()); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +template +__global__ void im2colOCF(const T* im_data, int im_channels, int im_height, + int im_width, int filter_height, int filter_width, + int stride_height, int stride_width, + int padding_height, int padding_width, int col_height, + int col_width, T* col_data) { + int swid = blockIdx.x; + int shid = blockIdx.y; + for (int channelid = threadIdx.z; channelid < im_channels; + channelid += blockDim.z) { + for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { + int width_offset = idx + swid * stride_width - padding_width; + int height_offset = idy + shid * stride_height - padding_height; + int im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; + + int col_offset = idx + idy * 
filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); + + col_data[col_offset] = + (height_offset >= im_height || height_offset < 0 || + width_offset >= im_width || width_offset < 0) + ? T(0) + : im_data[im_offset]; + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[3]; + int filter_width = col->dims()[4]; + int col_height = col->dims()[0]; + int col_width = col->dims()[1]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int block_dim_x = 0; + int block_dim_y = 0; + if (filter_height <= 4 && filter_width <= 4) { + block_dim_x = 4; + block_dim_y = 4; + } else if (filter_height <= 8 && filter_width <= 8) { + block_dim_x = 8; + block_dim_y = 8; + } else if (filter_height <= 16 && filter_width <= 16) { + block_dim_x = 16; + block_dim_y = 16; + } else { + block_dim_x = 32; + block_dim_y = 32; + } + + int block_dim_z = 1024 / block_dim_x / block_dim_y; + dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); + dim3 
grid(col_width, col_height); + im2colOCF<<>>( + im.data(), im_channels, im_height, im_width, filter_height, + filter_width, stride[0], stride[1], padding[0], padding[1], col_height, + col_width, col->data()); + } +}; + +template +__global__ void col2imOCF(const T* col_data, int im_channels, int im_height, + int im_width, int filter_height, int filter_width, + int stride_height, int stride_width, + int padding_height, int padding_width, int col_height, + int col_width, T* im_data) { + int swid = blockIdx.x; + int shid = blockIdx.y; + for (int channelid = threadIdx.z; channelid < im_channels; + channelid += blockDim.z) { + for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { + int width_offset = idx + swid * stride_width - padding_width; + int height_offset = idy + shid * stride_height - padding_height; + int im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; + + int col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); + + if (height_offset >= 0 && height_offset < im_height && + width_offset >= 0 && width_offset < im_width) { + paddle::platform::CudaAtomicAdd(im_data + im_offset, + col_data[col_offset]); + } + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = 
col.dims()[3]; + int filter_width = col.dims()[4]; + int col_height = col.dims()[0]; + int col_width = col.dims()[1]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int block_dim_x = 0; + int block_dim_y = 0; + if (filter_height <= 4 && filter_width <= 4) { + block_dim_x = 4; + block_dim_y = 4; + } else if (filter_height <= 8 && filter_width <= 8) { + block_dim_x = 8; + block_dim_y = 8; + } else if (filter_height <= 16 && filter_width <= 16) { + block_dim_x = 16; + block_dim_y = 16; + } else { + block_dim_x = 32; + block_dim_y = 32; + } + + int block_dim_z = 1024 / block_dim_x / block_dim_y; + dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); + dim3 grid(col_width, col_height); + col2imOCF<<>>( + col.data(), im_channels, im_height, im_width, filter_height, + filter_width, stride[0], stride[1], padding[0], padding[1], col_height, + col_width, im->data()); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.h b/paddle/fluid/operators/math/im2col.h new file mode 100644 index 0000000000000000000000000000000000000000..525c0f5dda102ef92a9d79832fecc10f99ccd900 --- /dev/null +++ b/paddle/fluid/operators/math/im2col.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum class ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data. + * \param imShape The shape of imData, + * [input_channels, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * \param dilations dilation data. + * \param 2-dimension [dilation_height, dilation_width]. + * + * \param strides stride data. + * \param 2-dimension [stride_height, stride_width]. + * + * \param paddings padding data. + * \param 4-dimension [up_pad, left_pad, down_pad, right_pad]. + * + * If the template argument Format is kCFO, the shape of colData is: + * [input_channels, filter_height, filter_width, output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_height * filter_width, and the width is equal + * output_height * output_width. 
+ * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_height, + * filter_width, ======> [height, width] + * output_height, + * output_width] + * + * If the template argument Format is kOCF, the shape of colData is: + * [output_height, output_width, input_channels, filter_height, filter_width] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seq_length, step_size], where the seq_length + * is equal output_height * output_width, and the step_size is equal + * input_channels * filter_height * filter_width. + * + * Reshape: + * shape of colData shape of sequence matrix + * [output_height, + * output_width, + * input_channels, ======> [seqLength, stepSize] + * filter_height, + * filter_width] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Im2ColFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col); +}; + +template +class Col2ImFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..59d6a84b892fc32aceb5622a856a5c648a3ade5b --- /dev/null +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/im2col.h" +#include + +template +void testIm2col() { + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor input; + paddle::framework::Tensor output_cfo; + paddle::framework::Tensor output_ocf; + paddle::framework::Tensor output_tmp; + + /** + * input = [0, 1, 2, + * 3, 4, 5] + * + * output_cfo = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5] + * + * output_ocf = [0, 1, 3, 4 + * 1, 2, 4, 5] + * + * col2im_cfo = [0, 2, 2 + * 3, 4, 5] + * + * col2im_ocf = [0, 2, 2 + * 3, 4, 5] + */ + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector stride({1, 1}); // stride_y, stride_x + std::vector padding( + {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = + (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1; + int output_width = + (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; + float* input_ptr = input_tmp.mutable_data( + {1, input_height, input_width}, paddle::platform::CPUPlace()); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input_ptr, arr, 6 * sizeof(float)); + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + output_cfo.mutable_data( + {1, filter_size, filter_size, output_height, output_width}, *place); + output_ocf.mutable_data( + {output_height, output_width, 1, filter_size, filter_size}, *place); + + // Im2Col + 
paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> + im2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> + im2col_ocf; + + im2col(*context, input, dilation, stride, padding, &output_cfo); + im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); + + float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; + float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; + + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output_cfo.data(); + } else { + Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); + } + + float* out_ocf_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_ocf_ptr = output_ocf.data(); + } else { + Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); + out_ocf_ptr = output_tmp.data(); + } + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); + } + + // Col2Im: kCFO + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> + col2im; + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> + col2im_ocf; + float col2im_data[] = {0, 2, 2, 3, 8, 5}; + + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + col2im(*context, output_cfo, dilation, stride, padding, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + // Col2Im: kOCF + memset(input_ptr, 0, 6 * sizeof(float)); + 
if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); + + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + delete place; + delete context; +} + +TEST(math, im2col) { + testIm2col(); +#ifdef PADDLE_WITH_CUDA + testIm2col(); +#endif +} diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..09eb89ec58d107f547a5c83908a9d2a541aa95f4 --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/lstm_kernel.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const platform::CPUDeviceContext& context, + LstmMetaValue value, int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, + cand_act, gate_act, cell_act); + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const platform::CPUDeviceContext& context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_backward(detail::backward::lstm(), value, grad, + frame_size, cand_act, gate_act, cell_act); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + + grad.gate_grad += frame_size * 4; + grad.state_grad += frame_size; + grad.state_active_grad += frame_size; + grad.output_grad += frame_size; + if (grad.prev_state_grad) { + grad.prev_state_grad += frame_size; + } + } + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // 
namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..adedee28bd010c611b9c6901c55bf67e73d3639b --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h" +#include "paddle/fluid/operators/math/detail/lstm_kernel.h" +#include "paddle/fluid/operators/math/lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const platform::CUDADeviceContext& context, + LstmMetaValue value, int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + detail::gpu_lstm_forward(context, detail::forward::lstm(), value, + frame_size, batch_size, cand_act, gate_act, + cell_act); + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const platform::CUDADeviceContext& context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + detail::gpu_lstm_backward(context, 
detail::backward::lstm(), value, grad, + frame_size, batch_size, cand_act, gate_act, + cell_act); + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/fluid/operators/math/lstm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8610e96cf1abb0e5a64cd60c6bab3a8c08754587 --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmMetaValue { + T *gate_value; + T *prev_state_value; + T *state_value; + T *state_active_value; + T *output_value; + T *check_ig; + T *check_fg; + T *check_og; +}; + +template +struct LstmMetaGrad { + T *gate_grad; + T *prev_state_grad; + T *state_grad; + T *state_active_grad; + T *output_grad; + T *check_ig_grad; + T *check_fg_grad; + T *check_og_grad; +}; + +template +class LstmUnitFunctor { + public: + static void compute(const DeviceContext &context, LstmMetaValue value, + int frame_size, int batch_size, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); +}; + +template +class LstmUnitGradFunctor { + public: + static void compute(const DeviceContext &context, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, int batch_size, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc new file mode 100644 index 0000000000000000000000000000000000000000..2636dbddde67955a99663aa93df1425b9e1ec2ce --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cc @@ -0,0 +1,342 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { + cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? 
CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { + cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, + framework::Tensor* matrix_out, float beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + platform::is_cpu_place(matrix_b.place()) && + platform::is_cpu_place(matrix_out->place()), + "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, + framework::Tensor* matrix_out, double beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + platform::is_cpu_place(matrix_b.place()) && + platform::is_cpu_place(matrix_out->place()), + "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +#ifdef PADDLE_WITH_MKLML +// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, + a_array.data(), &lda, b_array.data(), &ldb, &beta, + c_array.data(), &ldc, 1 /* group_count */, &batchCount); +} + +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, + a_array.data(), &lda, b_array.data(), &ldb, &beta, + c_array.data(), &ldc, 1 /* group_count */, &batchCount); +} +#else +// The below is a naive but correct serial implementation that just loops +// over the batch dimension. This is a fallback for when the batched gemm +// functions of Intel MKL are not available. In the future, this computation +// should be parallelized. 
+template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + for (int k = 0; k < batchCount; ++k) { + const float* Ak = &A[k * strideA]; + const float* Bk = &B[k * strideB]; + float* Ck = &C[k * M * N]; + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); + } +} + +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + for (int k = 0; k < batchCount; ++k) { + const double* Ak = &A[k * strideA]; + const double* Bk = &B[k * strideB]; + double* Ck = &C[k * M * N]; + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); + } +} +#endif + +template <> +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); +} + +template <> +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { + CBLAS_TRANSPOSE transA = (trans_a == false) ? 
CblasNoTrans : CblasTrans; + cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); +} + +template <> +void axpy( + const platform::CPUDeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + cblas_saxpy(n, alpha, x, 1, y, 1); +} + +template <> +void axpy( + const platform::CPUDeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + cblas_daxpy(n, alpha, x, 1, y, 1); +} + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(framework::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void operator()() const { + auto cpu = platform::CPUPlace(); + auto* begin = tensor_->mutable_data(cpu); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(framework::ToDataType(tensor->type()), + TensorSetConstantCPU(tensor, value)); +} + +struct TensorSetConstantWithPlace : public boost::static_visitor { + TensorSetConstantWithPlace(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()(Place place) const { + set_constant_with_place(context_, tensor_, value_); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +void 
set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) { + TensorSetConstantWithPlace func(context, tensor, value); +#ifdef PADDLE_WITH_CUDA + tensor->place().apply_visitor(func); +#else + func(platform::CPUPlace()); +#endif +} + +template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu new file mode 100644 index 0000000000000000000000000000000000000000..5764da71c8491e27493774569bb663f6a6e835c3 --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cu @@ -0,0 +1,355 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/math_function_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE(platform::dynload::cublasSgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasDgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasSgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? 
CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasDgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); +} + +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, + framework::Tensor* matrix_out, float beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, + framework::Tensor* matrix_out, double beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? 
dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +} + +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +} + +template <> +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { + cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + + PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); +} + +template <> +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { + cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); +} + +template <> +void axpy( + const platform::CUDADeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); +} + +template <> +void axpy( + const platform::CUDADeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); +} + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_GPU_TRANS(1); +DEFINE_GPU_TRANS(2); +DEFINE_GPU_TRANS(3); +DEFINE_GPU_TRANS(4); +DEFINE_GPU_TRANS(5); +DEFINE_GPU_TRANS(6); + +struct TensorSetConstantGPU { + 
TensorSetConstantGPU(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()() const { + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, static_cast(value_)); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(framework::ToDataType(tensor->type()), + TensorSetConstantGPU(context, tensor, value)); +} + +template +__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width, + int num) { + T tmp = 1.0 / width; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + int h = i * tmp; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template +struct RowwiseAdd { + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + int blocks = 512; + int grids = (input.numel() + blocks - 1) / blocks; + RowwiseAddKernel<<>>( + input.data(), vector.data(), output->data(), + static_cast(in_dims[1]), static_cast(input.numel())); + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. 
+template <> +void ColwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), size); + framework::Tensor one; + one.mutable_data({in_dims[0]}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[0]), static_cast(in_dims[1]), + 1.0, input.data(), one.data(), 0.0, + vector->data()); +} + +template struct RowwiseSum; +// template struct RowwiseSum; +// TODO(zcd): Following ColwiseSum format, need to confirm. +// The RowwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. +template <> +void RowwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + framework::Tensor one; + one.mutable_data({size}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[1]), static_cast(in_dims[0]), + 1.0, one.data(), input.data(), 0.0, + vector->data()); +} + +template struct RowwiseMean; +template struct RowwiseMean; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h new file mode 100644 index 0000000000000000000000000000000000000000..84916af1f8e5bd55920685b4897858f2da9fde92 --- /dev/null +++ b/paddle/fluid/operators/math/math_function.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MKLML +#include +#include +#include +#endif + +#ifdef PADDLE_USE_ATLAS +extern "C" { +#include +#include +} +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#include +#endif + +#ifndef LAPACK_FOUND +extern "C" { +#include +int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda, + int* ipiv); +int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda, + int* ipiv); +int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda, + const int* ipiv); +int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, + const int* ipiv); +} +#endif + +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +// Support continuous memory now +// If transA = N, and transB = N +// Then matrixA: M * K, matrixB: K * N, matrixC : M * N +// For more detailed info, please refer to +// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html +template +void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const T alpha, const T* A, const T* B, const T beta, T* C); + +// gemm wrapper with stride args for matrix uncontinuous in memory +template +void gemm(const DeviceContext& context, const bool transA, const bool transB, + const int M, const int N, const int K, const T alpha, const 
T* A, + const int lda, const T* B, const int ldb, const T beta, T* C, + const int ldc); + +// matrix multiply with continuous memory +template +void matmul(const DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, + T alpha, framework::Tensor* matrix_out, T beta); + +// Batched gemm +template +void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, + const int K, const T alpha, const T* A, const T* B, + const T beta, T* C, const int batchCount, const int strideA, + const int strideB); + +template +void gemv(const DeviceContext& context, const bool trans_a, const int M, + const int N, const T alpha, const T* A, const T* B, const T beta, + T* C); + +template +void axpy(const DeviceContext& context, const int n, const T alpha, const T* x, + T* y); + +template +struct Transpose { + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); +}; + +template +struct SetConstant { + void operator()(const DeviceContext& context, framework::Tensor* tensor, + T num); +}; + +template +void set_constant_with_place(const platform::DeviceContext& context, + framework::Tensor* tensor, float value); + +void set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value); + +template +struct RowwiseAdd { + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& vec, framework::Tensor* output); +}; + +template +struct ColwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const DeviceContext& context, const framework::Tensor& 
input, + framework::Tensor* vec); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..a55ed6c58bafafed1d58ff47fb433a9ae58b8261 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +void SetConstant::operator()(const DeviceContext& context, + framework::Tensor* tensor, + T num) { + auto t = framework::EigenVector::Flatten(*tensor); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); +} + +template +void Transpose::operator()( + const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis) { + Eigen::array permute; + for (int i = 0; i < Rank; i++) { + permute[i] = axis[i]; + } + auto in_dim = in.dims(); + auto out_dim = out->dims(); + + auto eigen_in = framework::EigenTensor::From(in); + auto eigen_out = framework::EigenTensor::From(*out); + auto* dev = context.eigen_device(); + eigen_out.device(*dev) = eigen_in.shuffle(permute); +} + +template +void ColwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); +} + +// Specialize for CPU, since Eigen implement a general reduce. However, +// colwise-sum can be easily implemented. 
General reduce has a huge overhead in +// CPU +template +class ColwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + for (size_t j = 0; j < static_cast(size); ++j) { + if (i == 0) { + out_buf[j] = in_buf[i * size + j]; + } else { + out_buf[j] += in_buf[i * size + j]; + } + } + } + } +}; + +template +void RowwiseMean::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. 
General reduce has a huge overhead in +// CPU +template +class RowwiseMean { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), height); + auto inv_size = 1.0 / size; + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum * inv_size; + } + } +}; + +template +void RowwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. 
General reduce has a huge overhead in +// CPU +template +class RowwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), height); // one sum per input row + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum; + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6cd8e8b35abeb6d321bb075d7b88f0abd8f0fc59 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+#include "paddle/fluid/operators/math/math_function.h" +#include "gtest/gtest.h" + +TEST(math_function, gemm_notrans_cblas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemm( + context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, + input3_ptr + 1, 4); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); +} + +TEST(math_function, gemm_trans_clbas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + 
paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemm( + context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, + input3_ptr + 1, 4); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); +} + +TEST(math_function, zero) { + paddle::framework::Tensor tensor; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* t = tensor.mutable_data({2, 2}, *cpu_place); + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::SetConstant + functor; + functor(context, &tensor, 0); + EXPECT_EQ(t[0], 0); + EXPECT_EQ(t[1], 0); + EXPECT_EQ(t[2], 0); + EXPECT_EQ(t[3], 0); + + functor(context, &tensor, 1); + + EXPECT_EQ(t[0], 1); + EXPECT_EQ(t[1], 1); + EXPECT_EQ(t[2], 1); + EXPECT_EQ(t[3], 1); +} + +template +void GemvTest(int m, int n, bool trans) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor vec_b; + paddle::framework::Tensor vec_c; + auto* cpu_place = new paddle::platform::CPUPlace(); + int b_num = trans ? m : n; + int c_num = trans ? 
n : m; + + T* data_a = mat_a.mutable_data({m, n}, *cpu_place); + T* data_b = vec_b.mutable_data({b_num}, *cpu_place); + T* data_c = vec_c.mutable_data({c_num}, *cpu_place); + for (int i = 0; i < mat_a.numel(); ++i) { + data_a[i] = static_cast(i); + } + for (int i = 0; i < vec_b.numel(); ++i) { + data_b[i] = static_cast(i); + } + + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemv( + context, trans, static_cast(m), static_cast(n), 1., data_a, + data_b, 0., data_c); + + if (!trans) { + for (int i = 0; i < m; ++i) { + T sum = 0.0; + for (int j = 0; j < n; ++j) { + sum += data_a[i * n + j] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } else { + for (int i = 0; i < n; ++i) { + T sum = 0.0; + for (int j = 0; j < m; ++j) { + sum += data_a[j * n + i] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } +} + +TEST(math_function, gemv) { + GemvTest(3, 13, false); + GemvTest(4, 5, false); + GemvTest(12, 7, true); + GemvTest(7, 9, true); +} + +TEST(math_funciton, set_constant) { + paddle::framework::Tensor t; + t.Resize({10, 10}); + t.mutable_data(paddle::platform::CPUPlace()); + auto* ctx = new paddle::platform::CPUDeviceContext(); + paddle::operators::math::set_constant(*ctx, &t, 10); + for (int64_t i = 0; i < t.numel(); ++i) { + PADDLE_ENFORCE_EQ(10, t.data()[i]); + } + delete ctx; +} diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..2ef53a8209940e52390cfe52978f3b32b6dabae6 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -0,0 +1,255 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" + +TEST(math_function, notrans_mul_trans) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr, 6 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({2, 2}, *gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); + + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); + + float* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(out_ptr[0], 5); + EXPECT_EQ(out_ptr[1], 14); + EXPECT_EQ(out_ptr[2], 14); + EXPECT_EQ(out_ptr[3], 50); + delete gpu_place; +} + +TEST(math_function, trans_mul_notrans) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + 
memcpy(input1_ptr, arr, 6 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({3, 3}, *gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); + + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); + + float* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(out_ptr[0], 9); + EXPECT_EQ(out_ptr[1], 12); + EXPECT_EQ(out_ptr[2], 15); + EXPECT_EQ(out_ptr[3], 12); + EXPECT_EQ(out_ptr[4], 17); + EXPECT_EQ(out_ptr[5], 22); + EXPECT_EQ(out_ptr[6], 15); + EXPECT_EQ(out_ptr[7], 22); + EXPECT_EQ(out_ptr[8], 29); + delete gpu_place; +} + +TEST(math_function, gemm_notrans_cublas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, 
context, &input3_gpu); + float* a = input1_gpu.data(); + float* b = input2_gpu.data(); + float* c = input3_gpu.mutable_data(*gpu_place); + + paddle::operators::math::gemm( + context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); + + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); + + // numpy code: + // a = np.arange(6).reshape(2, 3) + // b = np.arange(12).reshape(3, 4)[:, 1:] + // c = np.arange(8).reshape(2, 4)[:, 1:] + // out = np.arange(8).reshape(2, 4) + // out[:, 1:] = np.dot(a, b) + c + context.Wait(); + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); + delete gpu_place; +} + +TEST(math_function, gemm_trans_cublas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu); + float* a = 
input1_gpu.data(); + float* b = input2_gpu.data(); + float* c = input3_gpu.mutable_data(*gpu_place); + + paddle::operators::math::gemm( + context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); + + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); + context.Wait(); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); + delete gpu_place; +} + +template +void GemvTest(int m, int n, bool trans) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor vec_b; + paddle::framework::Tensor vec_c; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* data_a = mat_a.mutable_data({m, n}, *cpu_place); + T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); + T* data_c = vec_c.mutable_data({trans ? n : m}, *cpu_place); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::framework::Tensor g_mat_a; + paddle::framework::Tensor g_vec_b; + paddle::framework::Tensor g_vec_c; + T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), *gpu_place); + T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), *gpu_place); + T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), *gpu_place); + + for (int i = 0; i < mat_a.numel(); ++i) { + data_a[i] = static_cast(i); + } + for (int i = 0; i < vec_b.numel(); ++i) { + data_b[i] = static_cast(i); + } + + paddle::platform::CUDADeviceContext context(*gpu_place); + paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a); + paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b); + + paddle::operators::math::gemv( + context, trans, static_cast(m), static_cast(n), 1., g_data_a, + g_data_b, 0., g_data_c); + + paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context, + &vec_c); + + if (!trans) { + for (int i = 0; i < m; ++i) { + T sum = 0.0; + for (int j = 0; j < n; ++j) { + sum += 
data_a[i * n + j] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } else { + for (int i = 0; i < n; ++i) { + T sum = 0.0; + for (int j = 0; j < m; ++j) { + sum += data_a[j * n + i] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } +} + +TEST(math_function, gemv) { + GemvTest(3, 13, false); + GemvTest(3, 13, false); + GemvTest(3, 13, true); + GemvTest(3, 13, true); +} diff --git a/paddle/fluid/operators/math/matmul.h b/paddle/fluid/operators/math/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..50f79979d99141c8074c55d2b767285dced49f60 --- /dev/null +++ b/paddle/fluid/operators/math/matmul.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +// Implements the logic of numpy matmul: +// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html +// +// but allowing also for a, b to be transposed +// +// Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported +// yet. 
+template +class MatMulFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& a, + bool trans_a, const framework::Tensor& b, bool trans_b, + T alpha, framework::Tensor* out, T beta) { + auto dim_a = a.dims(); + auto dim_b = b.dims(); + + PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(), + "Tensors must all be in the same place."); + PADDLE_ENFORCE_GE(dim_a.size(), 1, + "Input tensor a must be at least 1-dimensional."); + PADDLE_ENFORCE_GE(dim_b.size(), 1, + "Input tensor b must be at least 1-dimensional."); + + std::vector out_dim; + int64_t batch_count = 1; + if (dim_a.size() > 3) { + PADDLE_ENFORCE(dim_b.size() == dim_a.size(), + "The dimensions of X and Y must be the same, and both of " + "them should be %d-dimensional.", + dim_b.size()); + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + for (int j = 0; j < dim_a.size() - 2; ++j) { + PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j], + "The %d-th dimension of X and Y must be the same.", + j); + out_dim.push_back(dim_a[j]); + batch_count *= dim_a[j]; + } + } + + int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0, + strideA = 0, strideB = 0; + + switch (dim_a.size()) { + case 1: + // similar to np.matmul: + // prepend dimension 1 (no transpose) or append dimension 1 (transpose) + M = trans_a ? dim_a[0] : 1; + kA = trans_a ? 1 : dim_a[0]; + break; + case 2: + M = trans_a ? dim_a[1] : dim_a[0]; + kA = trans_a ? dim_a[0] : dim_a[1]; + break; + case 3: + batchCountA = dim_a[0]; + M = trans_a ? dim_a[2] : dim_a[1]; + kA = trans_a ? dim_a[1] : dim_a[2]; + strideA = M * kA; + break; + default: + batchCountA = batch_count; + size_t mat_s = dim_a.size() - 2; + M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s]; + kA = trans_a ? 
dim_a[mat_s] : dim_a[mat_s + 1]; + strideA = M * kA; + } + + switch (dim_b.size()) { + case 1: + // similar to np.matmul: + // append dimension 1 (no transpose) or prepend dimension 1 (transpose) + kB = trans_b ? 1 : dim_b[0]; + N = trans_b ? dim_b[0] : 1; + break; + case 2: + kB = trans_b ? dim_b[1] : dim_b[0]; + N = trans_b ? dim_b[0] : dim_b[1]; + break; + case 3: + batchCountB = dim_b[0]; + kB = trans_b ? dim_b[2] : dim_b[1]; + N = trans_b ? dim_b[1] : dim_b[2]; + strideB = kB * N; + break; + default: + batchCountB = batch_count; + size_t mat_s = dim_b.size() - 2; + kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s]; + N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1]; + strideB = kB * N; + } + + PADDLE_ENFORCE_EQ( + kA, kB, + "First matrix's width must be equal with second matrix's height."); + if (batchCountA && batchCountB) { + PADDLE_ENFORCE_EQ( + batchCountA, batchCountB, + "When input tensors a and b are both batched, they must have the " + "same batch dimension."); + } + int batchCount = std::max(batchCountA, batchCountB); + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + if (!batchCount) { + // regular matrix multiplication + gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data()); + } else { + // batched matrix multiplication + batched_gemm( + context, transA, transB, M, N, kA, alpha, a.data(), b.data(), + beta, out->data(), batchCount, strideA, strideB); + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc new file mode 100644 index 0000000000000000000000000000000000000000..746328cd45ada637132ac661c87f4ac4710aeaa4 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/maxouting.h" + +namespace paddle { +namespace operators { +namespace math { + +// All tensors are in NCHW format, and the groups must be greater than 1 +template +class MaxOutFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + for (int ph = 0; ph < groups; ++ph) { + T x = input_data[(new_bindex + new_cindex) * groups + + ph * fea_size + f]; + ele = ele > x ? 
ele : x; + } + output_data[(new_bindex + new_cindex + f)] = ele; + } + } + } + } +}; + +template +class MaxOutGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0 = (blen + clen) * groups + f; + bool continue_match = true; + int output_idx = blen + clen + f; + for (int g = 0; g < groups && continue_match; ++g) { + int input_idx = input_idx0 + fea_size * g; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; + } + } + } + } + } + } +}; + +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu new file mode 100644 index 0000000000000000000000000000000000000000..68e5dfc3c551958c1f201341b8a704d9306ef150 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.cu @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void KernelMaxOut(const int nthreads, const T* input_data, + const int channels, const int input_height, + const int input_width, int groups, + T* output_data) { + const int size = input_height * input_width * channels / groups; + const int feat_len = input_height * input_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int batch_idx = i / size; + int batch_offset = i % size; + int channel_idx = batch_offset / feat_len; + int feat_idx = batch_offset % feat_len; + int data_idx = + (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; + T ele = static_cast(-FLT_MAX); + for (int g = 0; g < groups; ++g) { + T x = input_data[data_idx + g * feat_len]; + ele = ele > x ? 
 ele : x; + } + output_data[i] = ele; + } +} +template +__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, + const T* output_data, const T* output_grad, + T* input_grad, const int channels, + const int input_height, const int input_width, + int groups) { + const int size = input_height * input_width * channels / groups; + const int feat_len = input_height * input_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int batch_idx = i / size; + int batch_offset = i % size; + int channel_idx = batch_offset / feat_len; + int feat_idx = batch_offset % feat_len; + int data_idx = + (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; + int max_index = -1; + bool continue_match = true; + for (int g = 0; g < groups && continue_match; ++g) { + if (input_data[data_idx + g * feat_len] == output_data[i]) { + max_index = data_idx + g * feat_len; + continue_match = false; + break; + } + } + if (max_index != -1) { + input_grad[max_index] += output_grad[i]; // i, not the start index + } + } +} +/* + * All tensors are in NCHW format.
+ */ +template +class MaxOutFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + output_data); + } +}; +/* + * All tensors are in NCHW format. + */ +template +class MaxOutGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups); + } +}; + +template class MaxOutGradFunctor; 
+template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h new file mode 100644 index 0000000000000000000000000000000000000000..0e81790f0aba422f6676cd329def95a642a12239 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +class MaxOutFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); +}; + +template +class MaxOutGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..9adb142f14ea984c598855cf30838f0a1f2e5015 --- /dev/null +++ b/paddle/fluid/operators/math/pooling.cc @@ -0,0 +1,760 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class Pool2dFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + T ele = pool_process.initial(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute(ele, input_data[h * input_width + w]); + } + } + int pool_size = (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, (static_cast(pool_size))); + output_data[ph * output_width + pw] = ele; + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } +}; + +/* +* All tensors are in NCHW format. +* Ksize, strides, paddings are two elements. 
These two elements represent height +* and width, respectively. +*/ +template +class Pool2dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_grad_process, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + int pool_size = (hend - hstart) * (wend - wstart); + float scale = 1.0 / pool_size; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_grad_process.compute( + input_data[h * input_width + w], + output_data[ph * output_width + pw], + output_grad_data[ph * 
output_width + pw], + input_grad_data[h * input_width + w], + static_cast(scale)); + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class MaxPool2dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = 
std::max(wstart, 0); + + bool stop = false; + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = h * input_width + w; + int output_idx = ph * output_width + pw; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class Pool3dFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + int output_idx = (pd * output_height + ph) * output_width + pw; + T ele = pool_process.initial(); + for (int d = dstart; d 
< dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute( + ele, + input_data[(d * input_height + h) * input_width + w]); + } + } + } + int pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, static_cast(pool_size)); + output_data[output_idx] = ele; + } + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class Pool3dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_grad_process, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = 
input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + int pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + float scale = 1.0 / pool_size; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + pool_grad_process.compute( + input_data[input_idx], output_data[output_idx], + output_grad_data[output_idx], + input_grad_data[input_idx], static_cast(scale)); + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class MaxPool3dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + 
wstart = std::max(wstart, 0); + bool stop = false; + for (int d = dstart; d < dend && !stop; ++d) { + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += + output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + T1 ele = static_cast(-FLT_MAX); + int index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (ele < input_data[h * input_width + w]) { + ele = input_data[h * input_width + w]; + index = h * input_width + w; + } + } + } + output_data[ph * output_width + pw] = ele; + mask_data[ph * output_width + pw] = index; + } + } + // offset + input_data += input_stride; + output_data += output_stride; + mask_data += output_stride; + } + } 
+ } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_height = input_grad->dims()[2]; + const int input_width = input_grad->dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + for (int pw = 0; pw < output_width; ++pw) { + const int output_idx = ph * output_width + pw; + const int input_idx = static_cast(mask_data[output_idx]); + input_grad_data[input_idx] += output_grad_data[output_idx]; + } + } + // offset + input_grad_data += input_stride; + output_grad_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + int output_idx = (pd * output_height + ph) * 
output_width + pw; + T1 ele = static_cast(-FLT_MAX); + int index = -1; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + if (ele < input_data[input_idx]) { + index = input_idx; + ele = input_data[input_idx]; + } + } + } + } + output_data[output_idx] = ele; + mask_data[output_idx] = index; + } + } + } + // offset + input_data += input_stride; + output_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_depth = input_grad->dims()[2]; + const int input_height = input_grad->dims()[3]; + const int input_width = input_grad->dims()[4]; + const int output_channels = output_grad.dims()[1]; + const int output_depth = output_grad.dims()[2]; + const int output_height = output_grad.dims()[3]; + const int output_width = output_grad.dims()[4]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + for (int ph = 0; ph < output_height; ++ph) { + for (int pw = 0; pw < output_width; ++pw) { + const int output_idx = + (pd * output_height + 
ph) * output_width + pw; + const int input_idx = static_cast(mask_data[output_idx]); + input_grad_data[input_idx] += output_grad_data[output_idx]; + } + } + } + // offset + input_grad_data += input_stride; + output_grad_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..c65632de9066251a94ab3c32d4382d44f4120c2a --- /dev/null +++ b/paddle/fluid/operators/math/pooling.cu @@ -0,0 +1,1041 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/math/pooling.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// NOTE(review): the template parameter lists and static_cast<T> targets of
+// the kernels below were absent from the text under review; they are restored
+// here from how PoolProcess / T are used in the bodies. Confirm against the
+// upstream file.
+
+// Forward 2-D pooling over NCHW data.
+//
+// Each grid-stride iteration produces one output element: `index` is
+// decomposed into (batch_idx, c, ph, pw), the pooling window is clipped to the
+// input plane, and pool_process.initial()/compute()/finalize() reduce the
+// window into output_data[index]. pool_size is the size of the clipped
+// window, i.e. padded positions are not counted.
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2D(const int nthreads, const T* input_data,
+                             const int channels, const int input_height,
+                             const int input_width, const int output_height,
+                             const int output_width, const int ksize_height,
+                             const int ksize_width, const int stride_height,
+                             const int stride_width, const int padding_height,
+                             const int padding_width, PoolProcess pool_process,
+                             T* output_data) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    // Base of the (batch_idx, c) input plane. Kept in a per-iteration local
+    // instead of advancing the `input_data` parameter, so a thread that runs
+    // more than one loop iteration does not compound the offsets.
+    const T* input_plane =
+        input_data + (batch_idx * channels + c) * input_height * input_width;
+    T ele = pool_process.initial();
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        pool_process.compute(ele, input_plane[h * input_width + w]);
+      }
+    }
+    int pool_size = (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, (static_cast<T>(pool_size)));
+    output_data[index] = ele;
+  }
+}
+
+// Backward 2-D pooling driven by a PoolProcess gradient functor.
+//
+// Each grid-stride iteration handles one input element. It derives the range
+// of output positions [phstart, phend) x [pwstart, pwend) whose windows cover
+// that element, and lets pool_process.compute() accumulate into `gradient`
+// from the input value, the pooled output, the output gradient and the scale
+// 1 / pool_size of the (clipped) window. The result is stored in
+// input_grad[index].
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    PoolProcess pool_process, T* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetC = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
+
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int phend = min(offsetH / stride_height + 1, output_height);
+    int pwend = min(offsetW / stride_width + 1, output_width);
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx =
+        (batch_idx * channels + offsetC) * output_height * output_width;
+    // Per-iteration bases of the (batch_idx, offsetC) output planes; the
+    // pointer parameters themselves are left untouched so repeated loop
+    // iterations in one thread stay correct.
+    const T* output_plane = output_data + output_idx;
+    const T* output_grad_plane = output_grad + output_idx;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        int hstart = ph * stride_height - padding_height;
+        int wstart = pw * stride_width - padding_width;
+        int hend = min(hstart + ksize_height, input_height);
+        int wend = min(wstart + ksize_width, input_width);
+        hstart = max(hstart, 0);
+        wstart = max(wstart, 0);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        int output_sub_idx = ph * output_width + pw;
+        pool_process.compute(input, output_plane[output_sub_idx],
+                             output_grad_plane[output_sub_idx], gradient,
+                             static_cast<T>(1.0 / pool_size));
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+// Backward 2-D max pooling without a stored index mask.
+//
+// Each grid-stride iteration handles one output element: it scans the pooling
+// window for the first input value equal to the pooled output and atomically
+// adds output_grad[index] to the matching position of input_grad. Windows can
+// overlap, hence the atomic add.
+// NOTE(review): this only accumulates, so input_grad is presumably zero-filled
+// by the caller -- confirm.
+template <typename T>
+__global__ void KernelMaxPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    // Per-iteration plane bases (the parameters are not advanced, see
+    // KernelPool2D).
+    const int plane_offset =
+        (batch_idx * channels + c) * input_height * input_width;
+    const T* input_plane = input_data + plane_offset;
+    T* input_grad_plane = input_grad + plane_offset;
+
+    T ele = output_data[index];
+    int maxIndex = -1;
+    bool stop = false;
+    for (int h = hstart; h < hend && !stop; ++h) {
+      for (int w = wstart; w < wend && !stop; ++w) {
+        if (ele == input_plane[h * input_width + w]) {
+          maxIndex = h * input_width + w;
+          stop = true;
+        }
+      }
+    }
+
+    if (maxIndex != -1) {
+      // atomic add
+      platform::CudaAtomicAdd(input_grad_plane + maxIndex, output_grad[index]);
+    }
+  }
+}
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+// CUDA specialization of the 2-D pooling forward functor. Reads the NCHW
+// extents from `input` / `output`, allocates the output buffer on the
+// context's place and launches KernelPool2D with one thread per output
+// element (1024 threads per block).
+// NOTE(review): the specialization arguments, std::vector<int>, data<T>() and
+// the <<<...>>> launch configuration were missing from the text under review
+// and are reconstructed here; the stream argument in particular is an
+// assumption -- confirm against upstream.
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    // One thread per output element, rounded up to whole 1024-thread blocks.
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, pool_process, output_data);
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+// CUDA specialization of the 2-D pooling backward functor. Launches
+// KernelPool2DGrad with one thread per input element (1024 threads per block)
+// and writes the result into `input_grad`, which is allocated on the
+// context's place.
+// NOTE(review): the specialization arguments, std::vector<int>, data<T>() and
+// the <<<...>>> launch configuration were missing from the text under review
+// and are reconstructed here; the stream argument in particular is an
+// assumption -- confirm against upstream.
+template <typename PoolProcess, typename T>
+class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    // One thread per input element, rounded up to whole 1024-thread blocks.
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool2DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        pool_process, input_grad_data);
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+// CUDA specialization of the 2-D max-pooling backward functor. Launches
+// KernelMaxPool2DGrad with one thread per output element (1024 threads per
+// block).
+// NOTE(review): the kernel only adds into input_grad, so the buffer is
+// presumably zero-filled by the caller -- confirm.
+// NOTE(review): every <...> argument list in this chunk (template parameters,
+// specialization / instantiation arguments, std::vector<int>, data<T>(),
+// static_cast<T>, the <<<...>>> launch configuration) was missing from the
+// text under review and is reconstructed here; the stream argument in
+// particular is an assumption -- confirm against upstream.
+template <typename T>
+class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    // One thread per output element, rounded up to whole 1024-thread blocks.
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
+  }
+};
+
+// Explicit instantiations of the 2-D functors for float and double.
+template class MaxPool2dGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxPool2dGradFunctor<platform::CUDADeviceContext, double>;
+
+template class Pool2dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool2dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
+
+// Forward 3-D pooling over NCDHW data.
+//
+// Each grid-stride iteration produces one output element: `index` is
+// decomposed into (batch_idx, c, pd, ph, pw), the pooling window is clipped to
+// the input volume, and pool_process.initial()/compute()/finalize() reduce the
+// window into output_data[index]. pool_size is the size of the clipped
+// window.
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3D(const int nthreads, const T* input_data,
+                             const int channels, const int input_depth,
+                             const int input_height, const int input_width,
+                             const int output_depth, const int output_height,
+                             const int output_width, const int ksize_depth,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_depth, const int stride_height,
+                             const int stride_width, const int padding_depth,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process, T* output_data) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = pool_process.initial();
+    // Base of the (batch_idx, c) input volume. Kept in a per-iteration local
+    // instead of advancing the `input_data` parameter, so a thread that runs
+    // more than one loop iteration does not compound the offsets.
+    const T* input_volume =
+        input_data +
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          pool_process.compute(
+              ele, input_volume[(d * input_height + h) * input_width + w]);
+        }
+      }
+    }
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, static_cast<T>(pool_size));
+    output_data[index] = ele;
+  }
+}
+
+// Backward 3-D pooling driven by a PoolProcess gradient functor.
+//
+// Each grid-stride iteration handles one input element. It derives the range
+// of output positions whose windows cover that element and lets
+// pool_process.compute() accumulate into `gradient` from the input value, the
+// pooled output, the output gradient and the scale 1 / pool_size of the
+// (clipped) window. The result is stored in input_grad[index].
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, PoolProcess pool_process,
+    T* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetD =
+        (index / input_width / input_height) % input_depth + padding_depth;
+    int offsetC = (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+
+    int pdstart = (offsetD < ksize_depth)
+                      ? 0
+                      : (offsetD - ksize_depth) / stride_depth + 1;
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int pdend = min((offsetD) / stride_depth + 1, output_depth);
+    int phend = min((offsetH) / stride_height + 1, output_height);
+    int pwend = min((offsetW) / stride_width + 1, output_width);
+
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx = (batch_idx * channels + offsetC) * output_depth *
+                     output_height * output_width;
+    // Per-iteration bases of the (batch_idx, offsetC) output volumes; the
+    // pointer parameters themselves are left untouched so repeated loop
+    // iterations in one thread stay correct.
+    const T* output_volume = output_data + output_idx;
+    const T* output_grad_volume = output_grad + output_idx;
+
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int dstart = pd * stride_depth - padding_depth;
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int dend = min(dstart + ksize_depth, input_depth);
+          int hend = min(hstart + ksize_height, input_height);
+          int wend = min(wstart + ksize_width, input_width);
+          dstart = max(dstart, 0);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          int output_sub_idx = (pd * output_height + ph) * output_width + pw;
+          pool_process.compute(input, output_volume[output_sub_idx],
+                               output_grad_volume[output_sub_idx], gradient,
+                               static_cast<T>(1.0 / pool_size));
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+// Backward 3-D max pooling without a stored index mask.
+//
+// Each grid-stride iteration handles one output element: it scans the pooling
+// window for the first input value equal to the pooled output and atomically
+// adds output_grad[index] to the matching position of input_grad.
+// NOTE(review): this only accumulates, so input_grad is presumably zero-filled
+// by the caller -- confirm.
+template <typename T>
+__global__ void KernelMaxPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, T* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = output_data[index];
+    bool stop = false;
+    int maxIdx = -1;
+    // Per-iteration volume bases (the parameters are not advanced, see
+    // KernelPool3D).
+    const int volume_offset =
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    const T* input_volume = input_data + volume_offset;
+    T* input_grad_volume = input_grad + volume_offset;
+
+    for (int d = dstart; d < dend && !stop; ++d) {
+      for (int h = hstart; h < hend && !stop; ++h) {
+        for (int w = wstart; w < wend && !stop; ++w) {
+          if (ele == input_volume[(d * input_height + h) * input_width + w]) {
+            stop = true;
+            maxIdx = (d * input_height + h) * input_width + w;
+          }
+        }
+      }
+    }
+    if (maxIdx != -1) {
+      // atomic add
+      platform::CudaAtomicAdd(input_grad_volume + maxIdx, output_grad[index]);
+    }
+  }
+}
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
+// CUDA specialization of the 3-D pooling forward functor. Reads the NCDHW
+// extents from `input` / `output`, allocates the output buffer on the
+// context's place and launches KernelPool3D with one thread per output
+// element (1024 threads per block).
+// NOTE(review): the specialization arguments, std::vector<int>, data<T>() and
+// the <<<...>>> launch configuration were missing from the text under review
+// and are reconstructed here; the stream argument in particular is an
+// assumption -- confirm against upstream.
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    // One thread per output element, rounded up to whole 1024-thread blocks.
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, pool_process,
+        output_data);
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
+// CUDA specialization of the 3-D pooling backward functor. Launches
+// KernelPool3DGrad with one thread per input element (1024 threads per block)
+// and writes the result into `input_grad`, which is allocated on the
+// context's place.
+// NOTE(review): the specialization arguments, std::vector<int>, data<T>() and
+// the <<<...>>> launch configuration were missing from the text under review
+// and are reconstructed here; the stream argument in particular is an
+// assumption -- confirm against upstream.
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    // The output channel count is not needed: the launch is sized by the
+    // input extents and the kernel takes a single `channels` argument.
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    // One thread per input element, rounded up to whole 1024-thread blocks.
+    int nthreads =
+        batch_size * input_channels * input_depth * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool3DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, pool_process, input_grad_data);
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements.
These three elements represent
+ * depth, height and width, respectively.
+ */
+// CUDA specialization of the 3-D max-pooling backward functor. Launches
+// KernelMaxPool3DGrad with one thread per output element (1024 threads per
+// block).
+// NOTE(review): the kernel only adds into input_grad, so the buffer is
+// presumably zero-filled by the caller -- confirm.
+// NOTE(review): every <...> argument list in this chunk (template parameters,
+// specialization / instantiation arguments, std::vector<int>, data<T>(), the
+// <<<...>>> launch configuration) was missing from the text under review and
+// is reconstructed here; the stream argument in particular is an assumption
+// -- confirm against upstream.
+template <class T>
+class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    // One thread per output element, rounded up to whole 1024-thread blocks.
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool3DGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, input_grad_data);
+  }
+};
+
+// Explicit instantiations of the 3-D functors for float and double.
+template class MaxPool3dGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxPool3dGradFunctor<platform::CUDADeviceContext, double>;
+
+template class Pool3dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool3dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
+
+// Forward 2-D max pooling that also records where each maximum came from.
+//
+// Each grid-stride iteration produces one output element: it scans the
+// clipped pooling window, keeps the largest value in `ele` and its offset
+// inside the (batch, channel) input plane (h * input_width + w) in
+// `max_index`, then writes them to output_data[index] and mask_data[index].
+// The running maximum starts at -FLT_MAX, so if no element of the window is
+// greater than -FLT_MAX (or the window is empty) the stored mask is -1.
+template <typename T1, typename T2>
+__global__ void KernelMaxPool2dWithIdx(
+    const int nthreads, const T1* input_data, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, T1* output_data, T2* mask_data) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    // Base of the (batch_idx, c) input plane. Kept in a per-iteration local
+    // instead of advancing the `input_data` parameter, so a thread that runs
+    // more than one loop iteration does not compound the offsets.
+    const T1* input_plane =
+        input_data + (batch_idx * channels + c) * input_height * input_width;
+    T1 ele = -FLT_MAX;
+    int max_index = -1;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_index = h * input_width + w;
+        if (ele < input_plane[input_index]) {
+          max_index = input_index;
+          ele = input_plane[input_index];
+        }
+      }
+    }
+    output_data[index] = ele;
+    mask_data[index] = max_index;
+  }
+}
+
+// Backward 2-D max pooling using the index mask written by the forward pass.
+//
+// Each grid-stride iteration handles one input element: it visits every
+// output position whose window can cover the element and sums the output
+// gradients of those positions whose mask entry equals the element's offset
+// inside its plane. The sum is stored in input_grad[index], so no atomics are
+// needed.
+template <typename T1, typename T2>
+__global__ void KernelMaxPool2DWithIdxGrad(
+    const int nthreads, const T1* output_grad, const T2* mask_data,
+    const int channels, const int input_height, const int input_width,
+    const int output_height, const int output_width, const int ksize_height,
+    const int ksize_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, T1* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int w_offset = index % input_width;
+    int h_offset = (index / input_width) % input_height;
+    int c_offset = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
+
+    int ph_start =
+        (h_offset + padding_height < ksize_height)
+            ? 0
+            : (h_offset + padding_height - ksize_height) / stride_height + 1;
+    int pw_start =
+        (w_offset + padding_width < ksize_width)
+            ? 0
+            : (w_offset + padding_width - ksize_width) / stride_width + 1;
+    int ph_end =
+        min((h_offset + padding_height) / stride_height + 1, output_height);
+    int pw_end =
+        min((w_offset + padding_width) / stride_width + 1, output_width);
+
+    T1 gradient = 0;
+    int input_current_featuremap_idx = h_offset * input_width + w_offset;
+    int output_idx =
+        (batch_idx * channels + c_offset) * output_height * output_width;
+
+    // Per-iteration bases of the (batch_idx, c_offset) output planes; the
+    // pointer parameters themselves are left untouched so repeated loop
+    // iterations in one thread stay correct.
+    const T2* mask_plane = mask_data + output_idx;
+    const T1* output_grad_plane = output_grad + output_idx;
+    for (int ph = ph_start; ph < ph_end; ++ph) {
+      for (int pw = pw_start; pw < pw_end; ++pw) {
+        if (mask_plane[ph * output_width + pw] == input_current_featuremap_idx)
+          gradient += output_grad_plane[ph * output_width + pw];
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+// CUDA specialization of the 2-D max-pooling-with-index forward functor.
+// Allocates `output` (T1) and `mask` (T2) on the context's place and launches
+// KernelMaxPool2dWithIdx with one thread per output element (1024 threads per
+// block).
+// NOTE(review): the specialization arguments, std::vector<int>, data<T1>() /
+// mutable_data<...>() and the <<<...>>> launch configuration were missing from
+// the text under review and are reconstructed here; the stream argument in
+// particular is an assumption -- confirm against upstream.
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+
+    // One thread per output element, rounded up to whole 1024-thread blocks.
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, output_data, mask_data);
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+// CUDA specialization of the 2-D max-pooling-with-index backward functor.
+// The input extents are taken from `input_grad`; KernelMaxPool2DWithIdxGrad is
+// launched with one thread per input element (1024 threads per block).
+// NOTE(review): every <...> argument list in this chunk (template parameters,
+// specialization / instantiation arguments, std::vector<int>, data<...>(), the
+// <<<...>>> launch configuration) was missing from the text under review and
+// is reconstructed here; the stream argument in particular is an assumption
+// -- confirm against upstream.
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
+    const int output_height = output_grad.dims()[2];
+    const int output_width = output_grad.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+
+    // One thread per input element, rounded up to whole 1024-thread blocks.
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
+        nthreads, output_grad_data, mask_data, input_channels, input_height,
+        input_width, output_height, output_width, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
+  }
+};
+
+// Explicit instantiations of the 2-D with-index functors.
+template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, float,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, float,
+                                             int>;
+template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, double,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext,
+                                             double, int>;
+
+// Forward 3-D max pooling that also records where each maximum came from.
+//
+// Each grid-stride iteration produces one output element: it scans the
+// clipped pooling window, keeps the largest value in `ele` and its offset
+// inside the (batch, channel) input volume in `max_index`, then writes them
+// to output_data[index] and mask_data[index]. The running maximum starts at
+// -FLT_MAX, so if no element of the window is greater than -FLT_MAX (or the
+// window is empty) the stored mask is -1.
+template <typename T1, typename T2>
+__global__ void KernelMaxPool3DWithIdx(
+    const int nthreads, const T1* input_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    T1* output_data, T2* mask_data) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+
+    T1 ele = -FLT_MAX;
+    int max_index = -1;
+    // Base of the (batch_idx, c) input volume. Kept in a per-iteration local
+    // instead of advancing the `input_data` parameter, so a thread that runs
+    // more than one loop iteration does not compound the offsets.
+    const T1* input_volume =
+        input_data +
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          if (ele < input_volume[(d * input_height + h) * input_width + w]) {
+            max_index = (d * input_height + h) * input_width + w;
+            ele = input_volume[max_index];
+          }
+        }
+      }
+    }
+    output_data[index] = ele;
+    mask_data[index] = max_index;
+  }
+}
+
+// Backward 3-D max pooling using the index mask written by the forward pass.
+//
+// Each grid-stride iteration handles one input element: it visits every
+// output position whose window can cover the element and sums the output
+// gradients of those positions whose mask entry equals the element's offset
+// inside its volume. The sum is stored in input_grad[index], so no atomics
+// are needed.
+template <typename T1, typename T2>
+__global__ void KernelMaxPool3DWithIdxGrad(
+    const int nthreads, const T1* output_grad, const T2* mask,
+    const int channels, const int input_depth, const int input_height,
+    const int input_width, const int output_depth, const int output_height,
+    const int output_width, const int ksize_depth, const int ksize_height,
+    const int ksize_width, const int stride_depth, const int stride_height,
+    const int stride_width, const int padding_depth, const int padding_height,
+    const int padding_width, T1* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int w_offset = index % input_width;
+    int h_offset = (index / input_width) % input_height;
+    int d_offset = (index / input_width / input_height) % input_depth;
+    int c_offset =
+        (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+
+    int pd_start =
+        (d_offset + padding_depth < ksize_depth)
+            ? 0
+            : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
+    int ph_start =
+        (h_offset + padding_height < ksize_height)
+            ? 0
+            : (h_offset + padding_height - ksize_height) / stride_height + 1;
+    int pw_start =
+        (w_offset + padding_width < ksize_width)
+            ? 0
+            : (w_offset + padding_width - ksize_width) / stride_width + 1;
+    int pd_end =
+        min((d_offset + padding_depth) / stride_depth + 1, output_depth);
+    int ph_end =
+        min((h_offset + padding_height) / stride_height + 1, output_height);
+    int pw_end =
+        min((w_offset + padding_width) / stride_width + 1, output_width);
+
+    T1 gradient = 0;
+    int input_current_feature_map_idx =
+        (d_offset * input_height + h_offset) * input_width + w_offset;
+    int output_idx = (batch_idx * channels + c_offset) * output_depth *
+                     output_height * output_width;
+    // Per-iteration bases of the (batch_idx, c_offset) output volumes; the
+    // pointer parameters themselves are left untouched so repeated loop
+    // iterations in one thread stay correct.
+    const T2* mask_volume = mask + output_idx;
+    const T1* output_grad_volume = output_grad + output_idx;
+
+    for (int pd = pd_start; pd < pd_end; ++pd) {
+      for (int ph = ph_start; ph < ph_end; ++ph) {
+        for (int pw = pw_start; pw < pw_end; ++pw) {
+          if (mask_volume[(pd * output_height + ph) * output_width + pw] ==
+              input_current_feature_map_idx)
+            gradient +=
+                output_grad_volume[(pd * output_height + ph) * output_width +
+                                   pw];
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
+// CUDA specialization of the 3-D max-pooling-with-index forward functor.
+// Allocates `output` (T1) and `mask` (T2) on the context's place and launches
+// KernelMaxPool3DWithIdx with one thread per output element (1024 threads per
+// block).
+// NOTE(review): the specialization arguments, std::vector<int>, data<T1>() /
+// mutable_data<...>() and the <<<...>>> launch configuration were missing from
+// the text under review and are reconstructed here; the stream argument in
+// particular is an assumption -- confirm against upstream.
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+
+    // One thread per output element, rounded up to whole 1024-thread blocks.
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool3DWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, output_data, mask_data);
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */ +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_channels = input_grad->dims()[1]; + const int input_depth = input_grad->dims()[2]; + const int input_height = input_grad->dims()[3]; + const int input_width = input_grad->dims()[4]; + const int output_depth = output_grad.dims()[2]; + const int output_height = output_grad.dims()[3]; + const int output_width = output_grad.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T1* output_grad_data = output_grad.data(); + const T2* mask_data = mask.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = + batch_size * input_channels * input_depth * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool3DWithIdxGrad<<>>( + nthreads, output_grad_data, mask_data, input_channels, input_depth, + input_height, input_width, output_depth, output_height, output_width, + ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, + stride_width, padding_depth, padding_height, padding_width, + input_grad_data); + } +}; + +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff 
--git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..1195038f6a067fde776df8013a2a81e7003489a1 --- /dev/null +++ b/paddle/fluid/operators/math/pooling.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX \ + __FLT_MAX__ // It might need to be placed in another file, but I'm still + // wondering where to put it. + +/* + * \brief Extracting simple operations from pooling. + * Both MaxPool and AvgPool need "initial", "compute" and "finalize" + * operation. + * MaxPool initializes temp variable to the negative maximum to find the + * maximum value in the pooling field. + * AvgPool initializes temp variable to the zero to accumulate all values + * in pool pooling, and finally takes the average. + * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. + */ +template +class MaxPool { + public: + DEVICE inline T initial() { return static_cast(-FLT_MAX); } + DEVICE inline void compute(T& y, const T& x) { y = y > x ? 
y : x; } + DEVICE inline void finalize(T& y, const T& pool_field) {} +}; + +template +class AvgPool { + public: + DEVICE inline T initial() { return static_cast(0); } + DEVICE inline void compute(T& y, const T& x) { y += x; } + DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; } +}; + +template +class MaxPoolGrad { + public: + DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, + T scale) { + dx += dy * (x == y); + } +}; + +template +class AvgPoolGrad { + public: + DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, + T scale) { + dx += (scale * dy); + } +}; + +/* + * \brief Getting pooling results, and calculating gradient. + * + * In pool2d, all tensors are in NCHW format. Where N is batch size, C is the + * number of channels, H and W is the height and width of feature. + * In pool3d, all tensors are in NCDHW format. Where N is batch size, C is the + * number of channels, D, H and W is the depth, height and width of feature. + * + * In max pooling, it is possible that the pooling region has multiple maximum + * elements. In this case, we should compute the gradient of the first maximum + * element. + * This is different from average pooling. So we rewrite the max_pool_grad: + * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
+ */ +template +class Pool2dFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); +}; + +template +class Pool2dGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_compute, framework::Tensor* input_grad); +}; + +template +class MaxPool2dGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class Pool3dFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); +}; + +template +class Pool3dGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_compute, framework::Tensor* input_grad); +}; + +template +class MaxPool3dGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +/* + * \brief Getting max pooling results and corresponding max index, and + * calculating gradient. + * In up-sampling-pooling, it is necessary to know max element index. 
+ * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in + * NCDHW format. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); +}; + +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); +}; + +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc similarity index 100% rename from paddle/operators/math/sampler.cc rename to paddle/fluid/operators/math/sampler.cc diff --git a/paddle/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h similarity index 100% rename from paddle/operators/math/sampler.h rename to paddle/fluid/operators/math/sampler.h diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..01aa37ab35ce906133f6195df5d7014b4fb23d16 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -0,0 +1,298 @@ 
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct SelectedRowsAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* output) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2.height()); + output->set_height(in1_height); + + auto& in1_rows = input1.rows(); + auto& in2_rows = input2.rows(); + std::vector out_rows; + out_rows.reserve(in1_rows.size() + in2_rows.size()); + + // concat rows + out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end()); + out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end()); + output->set_rows(out_rows); + + auto* out_value = output->mutable_value(); + auto& in1_value = input1.value(); + auto& in2_value = input2.value(); + + auto in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + auto in2_place = input2.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + auto out_place = 
context.GetPlace(); + PADDLE_ENFORCE(platform::is_cpu_place(out_place)); + + auto* out_data = out_value->data(); + auto* in1_data = in1_value.data(); + memory::Copy(boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T)); + + auto* in2_data = in2_value.data(); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T)); + } +}; + +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; + +template +struct SelectedRowsAddTensor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const framework::Tensor& input2, framework::Tensor* output) { + auto in1_height = input1.height(); + auto in2_dims = input2.dims(); + auto out_dims = output->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); + PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + + SetConstant functor; + functor(context, output, 0.0); + + auto* in1_data = in1_value.data(); + auto* out_data = output->data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + out_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + + auto out_eigen = framework::EigenVector::Flatten(*output); + auto in2_eigen = framework::EigenVector::Flatten(input2); + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; + } +}; + +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; + +template +struct SelectedRowsAddTo { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const int64_t input2_offset, + framework::SelectedRows* 
input2) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + + auto& in1_rows = input1.rows(); + auto& in2_rows = *(input2->mutable_rows()); + + auto& in1_value = input1.value(); + auto* in2_value = input2->mutable_value(); + + // concat rows + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + auto in2_place = input2->place(); + PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + + auto* in1_data = in1_value.data(); + auto* in2_data = in2_value->data(); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T)); + } +}; + +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; + +template +struct SelectedRowsAddToTensor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + } +}; + +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +// This is a separated namespace for manipulate SelectedRows typed +// data. Like merge duplicated rows, adding two SelectedRows etc. 
+// +// Another group of functors is called "scatter updates", which means +// use SelectedRows to update a dense tensor with different Ops, like +// add or mul. +namespace scatter { + +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} + +template +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + auto input_rows = input.rows(); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = FindPos(merge_rows, input_rows[i]); + for (int64_t j = 0; j < input_width; j++) { + out_data[out_i * input_width + j] += input_data[i * input_width + j]; + } + } + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +struct UpdateToTensor { + void operator()(const platform::CPUDeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + 
auto* input2_data = input2->data(); + + // FIXME(typhoonzero): use macro fix the below messy code. + switch (op) { + case ScatterOps::ASSIGN: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::ADD: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUB: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] -= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUBBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] - + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + case ScatterOps::MUL: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] *= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIV: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] /= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIVBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] / + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + } + } +}; + +} // namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu new file mode 100644 index 0000000000000000000000000000000000000000..ee3b5d52058f7f04d5eeda1d033171f9eae0d772 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -0,0 +1,385 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct SelectedRowsAdd { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* output) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2.height()); + output->set_height(in1_height); + + framework::Vector in1_rows(input1.rows()); + auto& in2_rows = input2.rows(); + std::vector out_rows; + out_rows.reserve(in1_rows.size() + in2_rows.size()); + + // concat rows + out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end()); + out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end()); + output->set_rows(out_rows); + + auto* out_value = output->mutable_value(); + auto& in1_value = input1.value(); + auto& in2_value = input2.value(); + + auto in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + + auto* out_data = out_value->data(); + auto* in1_data = in1_value.data(); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); + auto in2_place = input2.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); + auto out_place = context.GetPlace(); + 
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); + + memory::Copy( + boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), + reinterpret_cast(context).stream()); + + auto* in2_data = in2_value.data(); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T), context.stream()); + } +}; + +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; + +namespace { +template +__global__ void SelectedRowsAddTensorKernel(const T* selected_rows, + const int64_t* rows, T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we can not use + // tensor_out[index] += selected_rows[index]; Instead, we have to use + // AtomicAdd to avoid concurrent write error. 
+ paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + } +} +} // namespace + +template +struct SelectedRowsAddTensor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const framework::Tensor& input2, framework::Tensor* output) { + auto in1_height = input1.height(); + auto in2_dims = input2.dims(); + auto out_dims = output->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + + auto& in1_value = input1.value(); + framework::Vector in1_rows(input1.rows()); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); + PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* in2_data = input2.data(); + auto* out_data = output->data(); + + SetConstant functor; + functor(context, output, 0.0); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in1_rows.size()); + SelectedRowsAddTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); + + auto out_eigen = framework::EigenVector::Flatten(*output); + auto in2_eigen = framework::EigenVector::Flatten(input2); + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; + } +}; + +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; + +template +struct SelectedRowsAddTo { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const int64_t input2_offset, + framework::SelectedRows* input2) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + + framework::Vector in1_rows(input1.rows()); + auto& in2_rows = *(input2->mutable_rows()); + + auto& in1_value = input1.value(); + auto* in2_value = input2->mutable_value(); + + // concat rows + if (in1_rows.size()) { + 
in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + } + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); + auto in2_place = input2->place(); + PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); + + auto* in1_data = in1_value.data(); + auto* in2_data = in2_value->data(); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); + } +}; + +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; + +namespace { +template +__global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, + const int64_t* rows, + T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. 
+ paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + } +} +} // namespace + +template +struct SelectedRowsAddToTensor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + framework::Vector in1_rows(input1.rows()); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* in2_data = input2->data(); + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in1_rows.size()); + SelectedRowsAddToTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); + } +}; + +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +namespace scatter { + +template +__global__ void MergeAddKernel(const T* input, const int64_t* input_rows, + T* out, const int64_t* out_rows, + size_t out_rows_size, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t out_idx; + + if (tid == 0) { + for (size_t i = 0; i < out_rows_size; i++) { + if (input_rows[ty] == out_rows[i]) { + out_idx = i; + } + } + } + + __syncthreads(); + + input += ty * row_numel; + out += out_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(out + index, input[index]); + } +} + +template +struct MergeAdd { + framework::SelectedRows operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + framework::Vector input_rows(input.rows()); + std::set row_set(input_rows.begin(), 
input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid1(1, input_rows.size()); + + MergeAddKernel< + T, 256><<(context) + .stream()>>>( + input_data, input_rows.CUDAData(context.GetPlace()), out_data, + out.mutable_rows()->CUDAMutableData(context.GetPlace()), + out.rows().size(), input_width); + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +__global__ void UpdateToTensorKernel(const T* selected_rows, + const int64_t* rows, const ScatterOps& op, + T* tensor_out, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + // FIXME(typhoonzero): use macro fix the below messy code. 
+ switch (op) { + case ScatterOps::ASSIGN: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index]; + } + break; + case ScatterOps::ADD: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] += selected_rows[index]; + } + break; + case ScatterOps::SUB: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] -= selected_rows[index]; + } + break; + case ScatterOps::SUBBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] - tensor_out[index]; + } + break; + case ScatterOps::MUL: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] *= selected_rows[index]; + } + break; + case ScatterOps::DIV: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] /= selected_rows[index]; + } + break; + case ScatterOps::DIVBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] / tensor_out[index]; + } + break; + } +} + +template +struct UpdateToTensor { + void operator()(const platform::CUDADeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { + // NOTE: Use SelectedRowsAddToTensor for better performance + // no additional MergeAdd called. 
+ MergeAdd merge_func; + auto merged_in1 = merge_func(context, input1); + + auto in1_height = merged_in1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = merged_in1.value(); + auto& in1_rows = merged_in1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.template data(); + auto* in2_data = input2->data(); + + dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); + dim3 grid(1, in1_rows.size()); + UpdateToTensorKernel<<< + grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(), + op, in2_data, in1_row_numel); + } +}; +} // namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..510a9ed8be6336448971de305279f607282f7658 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +#define INLINE_FOR2(sizei, sizej) \ + for (int64_t i = 0; i < sizei; i++) \ + for (int64_t j = 0; j < sizej; j++) + +namespace paddle { +namespace operators { +namespace math { + +// SelectedRows + SelectedRows will simplely concat value and rows. +// The real computation happens in dealing with LoDTensor. +template +struct SelectedRowsAdd { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* output); +}; + +template +struct SelectedRowsAddTensor { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::Tensor& input2, framework::Tensor* output); +}; + +// input2 = input1 + input2 +template +struct SelectedRowsAddTo { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const int64_t input2_offset, framework::SelectedRows* input2); +}; + +// input2 = input1 + input2 +template +struct SelectedRowsAddToTensor { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + framework::Tensor* input2); +}; + +namespace scatter { +// functors for manuplating SelectedRows data +template +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. 
+ framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input); +}; + +template +struct Add { + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + auto e_in2 = framework::EigenVector::Flatten(input2.value()); + e_out.device(*context.eigen_device()) = e_in1 + e_in2; + return out; + } +}; + +template +struct Mul { + // multiply two SelectedRows + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + auto e_in2 = framework::EigenVector::Flatten(input2.value()); + e_out.device(*context.eigen_device()) = e_in1 * e_in2; + return out; + } + // multiply scalar to SelectedRows + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const T input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + e_out.device(*context.eigen_device()) = input2 * e_in1; + return out; + } +}; + +enum class ScatterOps { 
ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; + +// out = seleted_rows_in / tensor +template +struct UpdateToTensor { + void operator()(const DeviceContext& context, const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2); +}; + +} // namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..db6b41cd52049a32239eca1a39cd730c11ddc2d8 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" + +TEST(selected_rows_functor, cpu_add) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CPUPlace cpu_place; + CPUDeviceContext ctx(cpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), cpu_place); + + SelectedRowsAdd add_functor; + add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); 
+ EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + + std::unique_ptr tensor2{new Tensor()}; + tensor2->mutable_data(make_ddim({height, row_numel}), cpu_place); + + SelectedRowsAddTensor add_tensor_functor; + add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); + + auto* tensor2_data = tensor2->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor2_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor2_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor2_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor2_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor2_data[9 * row_numel + 6], 5.0); +} + +TEST(selected_rows_functor, cpu_add_to) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CPUPlace cpu_place; + CPUDeviceContext ctx(cpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + output->set_height(height); + auto* out_value 
= output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), cpu_place); + + SelectedRowsAddTo add_to_functor; + add_to_functor(ctx, *selected_rows1, 0, output.get()); + add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + + SelectedRowsAddToTensor add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + + auto* tensor1_data = tensor1->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); +} diff --git 
a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..b3c4bc9244f9ca1771c5f435788cf3789d7c4574 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +TEST(selected_rows_functor, gpu_add) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CUDAPlace gpu_place(0); + CPUPlace cpu_place; + CUDADeviceContext ctx(gpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + 
auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), gpu_place); + + SelectedRowsAdd add_functor; + add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + Tensor out_cpu; + Copy(*out_value, cpu_place, ctx, &out_cpu); + ctx.Wait(); + + auto* out_cpu_data = out_cpu.data(); + // input1 value + EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); + functor(ctx, tensor1.get(), 3.0); + + std::unique_ptr tensor2{new Tensor()}; + tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); + + SelectedRowsAddTensor add_tensor_functor; + add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); + + Tensor tensor2_cpu; + Copy(*tensor2, cpu_place, ctx, &tensor2_cpu); + ctx.Wait(); + + auto* tensor2_cpu_data = tensor2_cpu.data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor2_cpu_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[5 * row_numel 
+ 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor2_cpu_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[9 * row_numel + 6], 5.0); +} + +TEST(selected_rows_functor, gpu_add_to) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CUDAPlace gpu_place(0); + CPUPlace cpu_place; + CUDADeviceContext ctx(gpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + output->set_height(height); + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), gpu_place); + + SelectedRowsAddTo add_to_functor; + add_to_functor(ctx, *selected_rows1, 0, output.get()); + add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + Tensor out_cpu; + Copy(*out_value, cpu_place, ctx, &out_cpu); + ctx.Wait(); + + auto* out_cpu_data = out_cpu.data(); + // input1 value + 
EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); + functor(ctx, tensor1.get(), 3.0); + + SelectedRowsAddToTensor add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + + Tensor tensor1_cpu; + Copy(*tensor1, cpu_place, ctx, &tensor1_cpu); + ctx.Wait(); + + auto* tensor1_cpu_data = tensor1_cpu.data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_cpu_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_cpu_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0); +} diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc new file mode 100644 index 0000000000000000000000000000000000000000..0485070fd9b722bdf9011452b2545e065f46d2ac --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index) { + size_t* index = index_lod.data(); + auto src_dims = src.dims(); + auto dst_dims = dst.dims(); + PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, + "The src must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL, + "The dst must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst.data(); + for (int i = 0; i < height; ++i) { + if (is_src_index) { + memcpy(dst_data + i * width, src_data + index[i] * width, + width * sizeof(T)); + } else { + memcpy(dst_data + index[i] * width, src_data + i * width, + width * sizeof(T)); + } + } + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu new file mode 100644 index 
0000000000000000000000000000000000000000..450be80ea2fe67aa0e537f06e15f07b38c5751ea --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, + int64_t height, int64_t width, + bool is_src_index) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int id = blockIdx.x + idy * GridDimX; + while (id < height) { + int src_idx = is_src_index ? index[id] : id; + int dst_idx = is_src_index ? 
id : index[id]; + const T* src_data = src + src_idx * width; + T* dst_data = dst + dst_idx * width; + for (int i = idx; i < width; i += BlockDimX) { + dst_data[i] = src_data[i]; + } + id += BlockDimY * GridDimX; + } +} + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index) { + auto src_dims = src.dims(); + auto dst_dims = dst.dims(); + PADDLE_ENFORCE_EQ(src_dims.size(), 2, + "The src must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(dst_dims.size(), 2, + "The dst must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst.data(); + + dim3 threads(128, 8); + dim3 grid(8, 1); + auto stream = context.stream(); + CopyMatrixRowsKernel<<>>( + src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, + width, is_src_index); + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h new file mode 100644 index 0000000000000000000000000000000000000000..00bd25ab613b198e539368f3233d71618dfc758f --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.h @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenMatrix = framework::EigenMatrix; + +template +class CopyMatrixRowsFunctor { + public: + // If is_src_index is true, + // copy the indexed rows of input src to the output dst. + // If is_src_index is false, + // copy the input src to the indexed rows of output dst. + // The indexed rows are based on the input index. + void operator()(const DeviceContext& context, const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index); +}; + +template +class LoDTensor2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. 
+ // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + // + struct SeqInfo { + SeqInfo(int start, int length, int seq_idx) + : start(start), length(length), seq_idx(seq_idx) {} + int start; + int length; + int seq_idx; + }; + + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& lod_tensor, + framework::LoDTensor& batch, bool is_cal_batch_lod, + bool is_reverse = false) const { + if (!is_cal_batch_lod) { + auto lods = batch.lod(); + PADDLE_ENFORCE_GT(lods.size(), 2UL); + PADDLE_ENFORCE_EQ(lods[1].size(), + static_cast(lod_tensor.dims()[0])); + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, lods[1], batch, true); + return; + } + + auto lods = lod_tensor.lod(); + auto lod = lods[0]; + PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + + std::vector seq_info; + for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { + int length = lod[seq_id + 1] - lod[seq_id]; + seq_info.emplace_back(lod[seq_id], length, seq_id); + } + + std::sort(seq_info.begin(), seq_info.end(), + [](SeqInfo a, SeqInfo b) { return a.length > b.length; }); + + // Calculate the start position of each batch. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // num_batch = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = len(b0) + // batch_start_positions[1] = len(b0) + len(b1) + // batch_start_positions[2] = len(b0) + len(b1) + len(b2) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // seq_order = {1, 0, 2}, the sort order. + // where 1 is the second sequence, + // 0 is the first sequence, + // 2 is the third sequence. + // The num_batch represents batch size after rearranging the + // input LodTensor. 
It is also the maximum length of input sequence. + + paddle::framework::LoD batch_lods; + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + + // batch_lods[0] is the start positions for batch LoDTensor + int num_batch = seq_info[0].length; + batch_lods[0].resize(static_cast(num_batch + 1)); + // batch_lods[1] is the raw index in the input LoDTensor + batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); + // batch_lods[2] is the sort order for the input LoDTensor. + batch_lods[2].resize(seq_info.size()); + + size_t* batch_starts = batch_lods[0].data(); + size_t* seq2batch_idx = batch_lods[1].data(); + batch_starts[0] = 0; + for (int n = 0; n < num_batch; n++) { + auto batch_id = static_cast(batch_starts[n]); + for (size_t i = 0; i < seq_info.size(); ++i) { + int seq_len = seq_info[i].length; + int start = seq_info[i].start; + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? start + seq_len - 1 - n : start + n; + batch_id++; + } else { + break; + } + } + batch_starts[n + 1] = static_cast(batch_id); + } + size_t* seq_order = batch_lods[2].data(); + for (size_t i = 0; i < seq_info.size(); ++i) { + seq_order[i] = seq_info[i].seq_idx; + } + batch.set_lod(batch_lods); + + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, batch_lods[1], batch, true); + } +}; + +template +class Batch2LoDTensorFunctor { + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& batch, + framework::LoDTensor& lod_tensor) const { + auto in_lod = batch.lod(); + PADDLE_ENFORCE_GT(in_lod.size(), 2UL); + PADDLE_ENFORCE_EQ(in_lod[1].size(), + static_cast(lod_tensor.dims()[0])); + CopyMatrixRowsFunctor to_seq; + to_seq(context, batch, in_lod[1], lod_tensor, false); + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc new file 
mode 100644 index 0000000000000000000000000000000000000000..ad8cd825676c77cc204bbb02f88f422d945f1a2c --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + const int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const int64_t 
num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + for (int64_t i = 0; i < max_sequence_length; ++i) { + for (int64_t j = 0; j < num_sequences; ++j) { + int64_t start_pos = abs_offset_lod[level][j]; + int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos; + if (i < sequence_length) { + // i > 0 => sequence_length > 0 + T scale = + norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; + for (int64_t k = 0; k < sequence_width; ++k) { + padding_data[(i * num_sequences + j) * sequence_width + k] = + seq_data[(start_pos + i) * sequence_width + k] * scale; + } + } else { + memset(padding_data + (i * num_sequences + j) * sequence_width, 0, + sequence_width * sizeof(T)); + } + } + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + const int64_t 
max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + for (int64_t i = 0; i < num_sequences; ++i) { + int64_t start_pos = abs_offset_lod[level][i]; + int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; + for (int64_t j = 0; j < sequence_length; ++j) { + // sequence_width > j > 0 + T scale = + norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; + for (int64_t k = 0; k < sequence_width; ++k) { + seq_data[(start_pos + j) * sequence_width + k] = + padding_data[(j * num_sequences + i) * sequence_width + k] * + scale; + } + } + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu new file mode 100644 index 0000000000000000000000000000000000000000..c1a390577840db2424185da19f5a5d2b231f25b6 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -0,0 +1,215 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void SequencePaddingKernel(T* padding, T* sequence, + const size_t* sequence_start_positions, + const size_t sequence_width, + const size_t max_sequence_length, + const size_t num_sequences) { + size_t padding_idx = blockIdx.y; + size_t start_pos = sequence_start_positions[padding_idx]; + size_t sequence_length = + sequence_start_positions[padding_idx + 1] - start_pos; + + size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t padding_base_idx = + (sequence_idx * num_sequences + padding_idx) * sequence_width; + size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; + + if (sequence_idx < sequence_length) { + T scale = NormByTimes ? 
(1.0f / static_cast(sequence_length)) : 1.0f; + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; + } + } else { + /* padding -> sequence */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; + } + } + } else if (sequence_idx < max_sequence_length) { + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = 0; + } + } + } +} + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + 
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(seq, context.GetPlace(), context, &padding); + padding.Resize(padding_dims); + return; + } + + const int64_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 
3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(padding, context.GetPlace(), context, &seq); + seq.Resize(seq_dims); + return; + } + + const int64_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h new file mode 100644 index 0000000000000000000000000000000000000000..0d84f9dcb3802d82cd385957f66ffe28269b8dfc --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +inline static size_t MaximumSequenceLength(const framework::LoD& lod, + const size_t level) { + const size_t num_sequences = lod[level].size() - 1; + size_t max_sequence_length = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + for (size_t i = 0; i < num_sequences; ++i) { + max_sequence_length = + std::max(max_sequence_length, + abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); + } + return max_sequence_length; +} + +/* + * \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape + * [max_sequence_length, num_sequences, sequence_width]. + * + * Padding sequence: + * padding[i] = seq[lod[level][i]] + * Unpadding sequence: + * seq[lod[level][i]] = padding[i] + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + * + * \param context device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param padding Tensor which is padded to the same length, the shape is + * [max_sequence_length, num_sequences, sequence_width]. + * \param norm_by_times whether dividing sequence's length. + * + * \note transposition is also done in this functor. 
+ */ +template +class PaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, const framework::LoDTensor& seq, + framework::Tensor& padding, bool norm_by_times); +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const framework::Tensor& padding, bool norm_by_times); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..147cb37da2bbb1bad6fc423b3936a1446e17de15 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/sequence_padding.h" +#include + +template +void TestSequencePadding(const paddle::framework::LoD& lod, + const size_t sequence_width) { + paddle::framework::LoDTensor cpu_seq; + paddle::framework::LoDTensor cpu_seq_back; + paddle::framework::LoDTensor seq; + paddle::framework::LoDTensor seq_back; + paddle::framework::Tensor padding; + + const size_t level = lod.size() - 1; + auto seq_dims = + paddle::framework::make_ddim({static_cast(lod[level].back()), + static_cast(sequence_width)}); + + cpu_seq.set_lod(lod); + cpu_seq.mutable_data(seq_dims, paddle::platform::CPUPlace()); + for (int64_t i = 0; i < cpu_seq.numel(); ++i) { + cpu_seq.data()[i] = static_cast(i); + } + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + seq = cpu_seq; + } else { + Copy(cpu_seq, *place, *context, &seq); + seq.set_lod(lod); + } + + const size_t max_sequence_length = + paddle::operators::math::MaximumSequenceLength(lod, level); + const size_t num_sequences = lod[level].size() - 1; + auto padding_dims = + paddle::framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + padding.mutable_data(padding_dims, *place); + paddle::operators::math::PaddingLoDTensorFunctor()( + *context, seq, padding, false); + + seq_back.set_lod(lod); + seq_back.mutable_data(seq_dims, *place); + paddle::operators::math::UnpaddingLoDTensorFunctor()( + *context, seq_back, padding, false); + + if (paddle::platform::is_cpu_place(*place)) { + cpu_seq_back = seq_back; + } else { + Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back); + cpu_seq_back.set_lod(lod); + } + + EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel()); + EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims()); + for (int64_t i = 0; i < cpu_seq.numel(); ++i) { + EXPECT_EQ(cpu_seq.data()[i], cpu_seq_back.data()[i]); + } + + delete place; + delete context; +}; + 
+TEST(Seq2BatchPadding, CPU) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} + +#ifdef PADDLE_WITH_CUDA +TEST(SequencePadding, CUDA) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} +#endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3b87ec93e19c4dcf99fbacf8f882f9f065430de --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/sequence_pooling.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class MaxSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto idx_dims = index->dims(); + PADDLE_ENFORCE_GT(in_dims.size(), 1); + PADDLE_ENFORCE_GT(out_dims.size(), 1); + for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, out_dims); + + auto starts = input.lod()[0]; + const T* in_data = input.data(); + T* out_data = output->data(); + int* max_index = index->data(); + + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + for (int64_t k = 0; k < dim; ++k) { + out_data[i * dim + k] = in_data[starts[i] * dim + k]; + max_index[i * dim + k] = starts[i]; + } + for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { + for (int64_t k = 0; k < dim; ++k) { + if (in_data[j * dim + k] > out_data[i * dim + k]) { + out_data[i * dim + k] = in_data[j * dim + k]; + max_index[i * dim + k] = j; + } + } + } + } + } +}; + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + framework::LoDTensor* in_grad) { + auto og_dims = out_grad.dims(); + auto ig_dims = in_grad->dims(); + auto idx_dims = index.dims(); + PADDLE_ENFORCE_GT(og_dims.size(), 1); + PADDLE_ENFORCE_GT(ig_dims.size(), 1); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, og_dims); + + const T* og_data = out_grad.data(); + const int* max_index = index.data(); + T* ig_data = in_grad->data(); 
+ + SetConstant set_zero; + set_zero(context, in_grad, static_cast(0.0)); + int64_t num_seq = og_dims[0]; + int64_t dim = out_grad.numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + for (int64_t j = 0; j < dim; ++j) { + int step_id = max_index[i * dim + j]; + ig_data[step_id * dim + j] = og_data[i * dim + j]; + } + } + } +}; + +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..c4267e992a78fd7e80a45ca1dcc1d81ca0f6e8f5 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +__global__ void KeMaxSequencePool(const T* input, const size_t* starts, + T* output, int* index, int64_t num_seq, + int64_t dim) { + int dim_idx = threadIdx.x; + int seq_id = blockIdx.x; + if (seq_id >= num_seq) return; + size_t start = starts[seq_id]; + size_t end = starts[seq_id + 1]; + + for (int64_t i = dim_idx; i < dim; i += blockDim.x) { + T max_val = static_cast(-FLT_MAX); + int max_id = -1; + for (size_t step_id = start; step_id < end; step_id++) { + if (max_val < input[step_id * dim + i]) { + max_val = input[step_id * dim + i]; + max_id = step_id; + } + } + output[seq_id * dim + i] = max_val; + index[seq_id * dim + i] = max_id; + } +} + +template +class MaxSeqPoolFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto idx_dims = index->dims(); + PADDLE_ENFORCE_GT(in_dims.size(), static_cast(1)); + PADDLE_ENFORCE_GT(out_dims.size(), 1); + for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, out_dims); + + auto starts = input.lod()[0]; + const T* in_data = input.data(); + T* out_data = output->data(); + int* max_index = index->data(); + + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + + dim3 threads(256, 1); + dim3 grid(num_seq, 1); + auto stream = context.stream(); + KeMaxSequencePool<<>>( + in_data, starts.CUDAData(context.GetPlace()), out_data, max_index, + num_seq, dim); + } +}; + +template +__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, + T* in_grad, int64_t num_seq, + int64_t dim) { + int idx = threadIdx.x + 
blockIdx.x * blockDim.x; + int col_idx = idx % dim; + if (idx < num_seq * dim) { + int step_id = max_index[idx]; + in_grad[step_id * dim + col_idx] = out_grad[idx]; + } +} + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + framework::LoDTensor* in_grad) { + auto og_dims = out_grad.dims(); + auto idx_dims = index.dims(); + auto ig_dims = in_grad->dims(); + PADDLE_ENFORCE_GT(og_dims.size(), static_cast(1)); + PADDLE_ENFORCE_GT(ig_dims.size(), static_cast(1)); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, og_dims); + + const T* og_data = out_grad.data(); + const int* max_index = index.data(); + T* ig_data = in_grad->data(); + + SetConstant set_zero; + set_zero(context, in_grad, static_cast(0.0)); + int64_t num_seq = og_dims[0]; + int64_t dim = out_grad.numel() / num_seq; + + unsigned int blocks = (num_seq * dim + 128 - 1) / 128; + dim3 threads(128, 1); + dim3 grid(blocks, 1); + auto stream = context.stream(); + KeMaxSequencePoolGrad<<>>( + og_data, max_index, ig_data, num_seq, dim); + } +}; + +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..9ba9cad74b54b3d0835ba5f89f6498f9309875cc --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +class MaxSeqPoolFunctor { + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index); +}; + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + framework::LoDTensor* in_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc new file mode 100644 index 0000000000000000000000000000000000000000..427689b9718db6ed8cbb8525712404c1498faf20 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_scale.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const T* scales) { + const size_t level = 0; + auto lod = seq.lod(); + const size_t num_seq = lod[level].size() - 1; + size_t seq_width = seq.dims()[1]; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + T* seq_data = seq.mutable_data(context.GetPlace()); + for (size_t i = 0; i < num_seq; ++i) { + for (size_t j = lod[level][i] * seq_width; + j < lod[level][i + 1] * seq_width; ++j) { + seq_data[j] *= scales[i]; + } + } + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu new file mode 100644 index 0000000000000000000000000000000000000000..7c081ed7f4547c4e1200ea7554f83696e404d021 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/sequence_scale.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales, + const size_t seq_width) { + for (int i = threadIdx.x; + i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width; + i += BlockSize) { + int idx = lod[blockIdx.x] * seq_width + i; + seq[idx] *= scales[blockIdx.x]; + } +} + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const T* scales) { + const size_t level = 0; + auto lod = seq.lod(); + const size_t num_seq = lod[level].size() - 1; + const size_t seq_width = seq.numel() / seq.dims()[0]; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + T* seq_data = seq.mutable_data(context.GetPlace()); + + SequenceScaleKernel<<< + num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( + seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), + scales, seq_width); + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..e8e07fd3156cc516c904a1d3d510a7c6eed5b8a0 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Scale a sequence. + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * Given: + * seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * scales = (2, 3, 4, 5) + * then: + * result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3) + + * + * \param context Device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param scales Array. The i-th sequence will be scaled by scales[i]. + * \param num_seq Number of sequence + * + */ +template +class ScaleLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const T* scales); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc new file mode 100644 index 0000000000000000000000000000000000000000..eab31ec567d15a52661bf5ab2373819d8e1e7ddf --- /dev/null +++ b/paddle/fluid/operators/math/softmax.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu new file mode 100644 index 0000000000000000000000000000000000000000..733d7eeee6d08241783b8f854b25863f9b756c80 --- /dev/null +++ b/paddle/fluid/operators/math/softmax.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..b7d67d5f12d83f015297e75c730de27566e5489b --- /dev/null +++ b/paddle/fluid/operators/math/softmax.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class SoftmaxFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); +}; + +template +class SoftmaxGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f7c61cb647e899e25f3d9806c993395e898794ab --- /dev/null +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenMatrix = framework::EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = -64.; + return x < kThreshold ? 
kThreshold : x; + } +}; + +template +void SoftmaxFunctor::operator()(const DeviceContext& context, + const framework::Tensor* X, + framework::Tensor* Y) { + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); +} + +template +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto dot = (softmax * softmax_grad) + .sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class); + logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc new file mode 100644 index 
0000000000000000000000000000000000000000..e02bc02e0022b82085e29ac6c83677c0accfce49 --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/unpooling.h" +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; +template 
+class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const int* indices_data = indices.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..2e74270fdf16b470ab6438a3283525c725b2d01b --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.cu @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/unpooling.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { +template +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + output_data[out_offset + out_index] = input_data[i]; + } +} +template +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * 
output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + input_grad[i] = output_grad[out_offset + out_index]; + } +} +/* + * All tensors are in NCHW format. + */ +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMax<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_height, output_width); + } +}; +/* + * All tensors are in NCHW format. 
+ */ +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_grad_data, output_height, + output_width, input_grad_data); + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h new file mode 100644 index 0000000000000000000000000000000000000000..f245ba7ba873e7f217c577e0c895f9a8d48e9cdf --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output); +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc new file mode 100644 index 0000000000000000000000000000000000000000..ded0bbc74477656310cb4d464c5709173f20f505 --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * vol = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col->dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "mismatching."); + + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / 
filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx = + ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + +/* + * vol = [input_channels,input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const { + PADDLE_ENFORCE(vol->dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol->dims()[0]; + int input_depth = vol->dims()[1]; + int input_height = vol->dims()[2]; + int input_width = vol->dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "mismatching."); + 
PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "mismatching."); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx = + ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu new file mode 100644 index 0000000000000000000000000000000000000000..35ef24c7f5ffe793a4aefe69807da5ffcf5ced4a --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.cu @@ -0,0 +1,262 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void vol2col(int num_kernels, const T* data_vol, int depth, + int height, int width, int dilation_d, int dilation_h, + int dilation_w, int filter_depth, int filter_height, + int filter_width, int stride_depth, int stride_height, + int stride_width, int padding_depth, int padding_height, + int padding_width, int output_detph, int output_height, + int output_width, T* data_col) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + int w_out = index % output_width; + int h_out = (index / output_width) % output_height; + int d_out = (index / output_width / output_height) % output_detph; + int channel_in = index / output_width / output_height / output_detph; + int channel_out = channel_in * filter_depth * filter_height * filter_width; + int w_in = w_out * stride_width - padding_width; + int h_in = h_out * stride_height - padding_height; + int d_in = d_out * stride_depth - padding_depth; + + data_col += ((channel_out * output_detph + d_out) * output_height + h_out) * + output_width + + w_out; + data_vol += ((channel_in * depth + d_in) * height + h_in) * width + w_in; + for (int k = 0; k < filter_depth; ++k) { + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + 
int d = d_in + k * dilation_d; + int h = h_in + i * dilation_h; + int w = w_in + j * dilation_w; + int col_idx = (k * dilation_d * height + i * dilation_h) * width + + j * dilation_w; + *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && + w < width) + ? data_vol[col_idx] + : 0; + data_col += output_detph * output_height * output_width; + } + } + } + } +} + +/* + * im = [input_channels,intpu_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col->dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "Mismatching."); + + int num_outputs = + input_channels * output_depth * output_height * output_width; + + const int threads = 1024; + 
const int blocks = (num_outputs + 1024 - 1) / 1024; + vol2col<<>>( + num_outputs, vol.data(), input_depth, input_height, input_width, + dilations[0], dilations[1], dilations[2], filter_depth, filter_height, + filter_width, strides[0], strides[1], strides[2], paddings[0], + paddings[1], paddings[2], output_depth, output_height, output_width, + col->data()); + } +}; + +template +__global__ void col2vol(int num_kernels, const T* data_col, int depth, + int height, int width, int dilation_d, int dilation_h, + int dilation_w, int filter_depth, int filter_height, + int filter_width, int stride_depth, int stride_height, + int stride_width, int padding_depth, int padding_height, + int padding_width, int output_detph, int output_height, + int output_width, T* data_vol) { + const int d_filter_depth = dilation_d * (filter_depth - 1) + 1; + const int d_filter_height = dilation_h * (filter_height - 1) + 1; + const int d_filter_width = dilation_w * (filter_width - 1) + 1; + + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + T src_val = 0; + int w = index % width + padding_width; + int h = (index / width) % height + padding_height; + int d = (index / width / height) % depth + padding_depth; + int c = index / width / height / depth; + + // compute the start and end of the output + int w_col_start = + (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; + int w_col_end = min(w / stride_width + 1, output_width); + int h_col_start = + (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1; + int h_col_end = min(h / stride_height + 1, output_height); + int d_col_start = + (d < d_filter_depth) ? 
0 : (d - d_filter_depth) / stride_depth + 1; + int d_col_end = min(d / stride_depth + 1, output_detph); + + for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + int d_off = (d - d_col * stride_depth); + int h_off = (h - h_col * stride_height); + int w_off = (w - w_col * stride_width); + if (d_off % dilation_d == 0 && h_off % dilation_h == 0 && + w_off % dilation_w == 0) { + d_off /= dilation_d; + h_off /= dilation_h; + w_off /= dilation_w; + + int data_col_index = + (((((c * filter_depth + d_off) * filter_height + h_off) * + filter_width + + w_off))); + data_col_index = + ((data_col_index * output_detph + d_col) * output_height + + h_col) * + output_width + + w_col; + src_val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = src_val; + } +} + +/* + * im = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const { + PADDLE_ENFORCE(vol->dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol->dims()[0]; + int input_depth = vol->dims()[1]; + int input_height = vol->dims()[2]; + int input_width = vol->dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + 
"Mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "Mismatching."); + + int num_kernels = input_channels * input_depth * input_height * input_width; + + const int threads = 1024; + const int blocks = (num_kernels + 1024 - 1) / 1024; + + col2vol<<>>( + num_kernels, col.data(), input_depth, input_height, input_width, + dilations[0], dilations[1], dilations[2], filter_depth, filter_height, + filter_width, strides[0], strides[1], strides[2], paddings[0], + paddings[1], paddings[2], output_depth, output_height, output_width, + vol->data()); + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h new file mode 100644 index 0000000000000000000000000000000000000000..3ce38b2d11f7c64f1004a73ecfc7d85a5a6346ba --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +/* + * \brief Converts the feature data of four dimensions(CDHW) into a colData of + * seven dimensions in the Vol2ColFunctor calculation, + * And in the Col2VolFunctor calculation, it is reversed. + * + * \param volData Vol data. + * \param volShape The shape of volData, + * [input_channels, input_depth, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * \param dilations dilation data. + * \param 3-dimension [dilation_depth, dilation_height, dilation_width]. + * + * \param strides stride data. + * \param 3-dimension [stride_depth, stride_height, stride_width]. + * + * \param paddings padding data. + * \param 3-dimension [d_pad, h_pad, w_pad]. + * + * The shape of colData is: + * [input_channels, filter_depth, filter_height, filter_width, output_depth, + * output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_depth * filter_height * filter_width, and the width + * is equal output_depth * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_depth, + * filter_height, + * filter_width, ======> [height, width] + * output_depth, + * output_height, + * output_width] + * + * \note The caller needs to ensure that volShape.inputChannels is equal to + * colShape.inputChannels. 
+ */ +template +class Vol2ColFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const; +}; + +template +class Col2VolFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..af0a900f80e9bec2c5cfd0ec7dc66beabba049d7 --- /dev/null +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -0,0 +1,127 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/vol2col.h" +#include +#include + +template +void testVol2col() { + paddle::framework::Tensor input; + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor output; + paddle::framework::Tensor output_tmp; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + + /** + * input = [[0, 1, 2, + * 3, 4, 5] + * [6, 7, 8, + * 9, 10, 11]] + * + * output = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5 + * 6, 7 + * 7, 8 + * 9, 10 + * 10, 11] + * + * col2vol = [[0, 2, 2, + * 3, 8, 5] + * [6, 14, 8, + * 9, 20, 11]] + * + */ + int input_depth = 2; + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector strides({1, 1, 1}); + std::vector paddings({0, 0, 0}); + std::vector dilations({1, 1, 1}); + int output_depth = + (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1; + int output_height = + (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1; + int output_width = + (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1; + + // Vol2Col test + float* input_ptr = + input_tmp.mutable_data({1, input_depth, input_height, input_width}, + paddle::platform::CPUPlace()); + float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input_ptr, arr, 12 * sizeof(float)); + + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + output.mutable_data({1, filter_size, filter_size, filter_size, + output_depth, output_height, output_width}, + *place); + + paddle::operators::math::Vol2ColFunctor vol2col; + vol2col(*context, input, dilations, strides, paddings, &output); + + float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output.data(); + } else { + Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + + for (int i = 0; i < 16; 
++i) { + EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]); + } + + // Col2Vol test + float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; + memset(input_ptr, 0, 12 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + paddle::operators::math::Col2VolFunctor col2vol; + col2vol(*context, output, dilations, strides, paddings, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + + for (int i = 0; i < 12; ++i) { + EXPECT_EQ(in_ptr[i], col_2_vol[i]); + } +} + +TEST(math, vol2col) { + testVol2col(); +#ifdef PADDLE_WITH_CUDA + testVol2col(); +#endif // PADDLE_WITH_CUDA +} diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..267b0057bf4894705b7e6eddb8e2e2eaa5c18c8e --- /dev/null +++ b/paddle/fluid/operators/matmul_op.cc @@ -0,0 +1,244 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/matmul_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MatMulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of MatMulOp should not be null."); + PADDLE_ENFORCE(context->HasInput("Y"), + "Input(Y) of MatMulOp should not be null."); + PADDLE_ENFORCE(context->HasOutput("Out"), + "Output(Out) of MatMulOp should not be null."); + + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + bool transpose_x = context->Attrs().Get("transpose_X"); + bool transpose_y = context->Attrs().Get("transpose_Y"); + + PADDLE_ENFORCE_GE(dim_x.size(), 1, + "Input tensor X must be at least 1-dimensional."); + PADDLE_ENFORCE_GE(dim_y.size(), 1, + "Input tensor Y must be at least 1-dimensional."); + + std::vector out_dim; + int64_t batch_count = 1; + if (dim_x.size() > 3) { + PADDLE_ENFORCE_EQ( + dim_y.size(), dim_x.size(), + "The dimensions of X and Y must be the same, and both of " + "them should be %d-dimensional.", + dim_x.size()); + + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + for (int j = 0; j < dim_x.size() - 2; ++j) { + PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j], + "The %d-th dimension of X and Y must be the same.", + j); + out_dim.push_back(dim_x[j]); + batch_count *= dim_x[j]; + } + } + + int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0; + bool remove_initial_dim = false, remove_final_dim = false; + + switch (dim_x.size()) { + case 1: + if (transpose_x) { + M = dim_x[0]; + KX = 1; + } else { + M = 1; + KX = dim_x[0]; + remove_initial_dim = true; + } + break; + case 2: + M = transpose_x ? dim_x[1] : dim_x[0]; + KX = transpose_x ? 
dim_x[0] : dim_x[1]; + break; + case 3: + batchCountX = dim_x[0]; + M = transpose_x ? dim_x[2] : dim_x[1]; + KX = transpose_x ? dim_x[1] : dim_x[2]; + break; + default: + batchCountX = batch_count; + size_t mat_s = dim_x.size() - 2; + M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s]; + KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1]; + break; + } + + switch (dim_y.size()) { + case 1: + if (transpose_y) { + N = dim_y[0]; + KY = 1; + } else { + N = 1; + KY = dim_y[0]; + remove_final_dim = true; + } + break; + case 2: + KY = transpose_y ? dim_y[1] : dim_y[0]; + N = transpose_y ? dim_y[0] : dim_y[1]; + break; + case 3: + batchCountY = dim_y[0]; + KY = transpose_y ? dim_y[2] : dim_y[1]; + N = transpose_y ? dim_y[1] : dim_y[2]; + break; + default: + batchCountY = batch_count; + size_t mat_s = dim_y.size() - 2; + KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s]; + N = transpose_y ? dim_y[mat_s] : dim_y[mat_s + 1]; + } + + PADDLE_ENFORCE_EQ( + KX, KY, + "First matrix's width must be equal with second matrix's height."); + if (batchCountX && batchCountY) { + PADDLE_ENFORCE_EQ( + batchCountX, batchCountY, + "When Input(X) and Input(Y) are both three dimensional, they " + "must have the same batch dimension."); + } + int batchCount = std::max(batchCountX, batchCountY); + + std::vector dim_out; + if (batchCount) { + if (dim_x.size() > 3) { + dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end()); + } else { + dim_out.push_back(batchCount); + } + } + if (!remove_initial_dim) { + dim_out.push_back(M); + } + if (!remove_final_dim) { + dim_out.push_back(N); + } + if (dim_out.size() == 0) { + // We don't support 0-dimensional Tensors (scalars), so instead + // treat the output as a Tensor of shape (1, ) in this case. 
+ dim_out.push_back(1); + } + context->SetOutputDim("Out", framework::make_ddim(dim_out)); + context->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of MatMul op"); + AddInput("Y", "The second input of MatMul op"); + AddOutput("Out", "The output of MatMul op"); + AddAttr("transpose_X", + R"DOC(If true, use the transpose of `X`. + )DOC") + .SetDefault(false); + AddAttr("transpose_Y", + R"DOC(If true, use the transpose of `Y`. + )DOC") + .SetDefault(false); + AddComment(R"DOC( +MatMul Operator. + + +This operator is used to perform (batched) matrix multiplication +over the last two dimensions of the input tensors `X` and `Y`. + +If a transpose flag is specified, the last two dimensions of the +tensor are transposed. If the tensor is rank-1 of shape [D], then +for `X` it is treated as [1, D] in nontransposed form and as [D, 1] +in transposed form, whereas for `Y` it is the opposite: It is treated +as [D, 1] in nontransposed form and as [1, D] in transposed form. + +Examples without transpose: +- X: [K], Y: [K] => Out: [1] +- X: [K], Y: [K, N] => Out: [N] +- X: [B, M, K], Y: [K] => Out: [B, M] +- X: [M, K], Y: [B, K, N] => Out: [B, M, N] +- X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] +- X: [B, ..., M, K], Y: [B, ..., K, N] => Out: [B, ..., M, N] + +The behavior is designed to be similar to the `numpy.matmul` function. +The differences are: +- When the rank of the input data is less than or equal to 3, it + is similar to the `numpy.matmul` function. +- When the rank of the input is greater than 3, the rank of X and + Y must be equal, and the first `rank - 2` dimensions must be equal. +- We add `transpose_X` and `transpose_Y` flags. + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. 
But the output only shares the LoD information with input `X`. + +)DOC"); + } +}; + +class MatMulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = context->GetInputDim("X"); + auto y_dims = context->GetInputDim("Y"); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (context->HasOutput(x_grad_name)) { + context->SetOutputDim(x_grad_name, x_dims); + } + if (context->HasOutput(y_grad_name)) { + context->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, + ops::MatMulOpGrad); +REGISTER_OP_CPU_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/fluid/operators/matmul_op.cu.cc b/paddle/fluid/operators/matmul_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..988787f0fe40ccdb266327dbff400a72f5dc448b --- /dev/null +++ b/paddle/fluid/operators/matmul_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/matmul_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/fluid/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f4cae3c91cb03980b915a70242e368852322b365 --- /dev/null +++ b/paddle/fluid/operators/matmul_op.h @@ -0,0 +1,242 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matmul.h" + +namespace paddle { +namespace operators { +namespace matmul_detail { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +using framework::make_ddim; +using framework::vectorize; + +template +class MatMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + Tensor* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + math::MatMulFunctor()( + context.template device_context(), x, transpose_x, y, + transpose_y, T(1), out, T(0)); + } +}; + +template +inline Tensor Reshape(const Tensor& input, const DDim& dims) { + Tensor output; + output.ShareDataWith(input); + output.Resize(dims); + return output; +} + +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +template +Tensor CombineBatchAndM(const Tensor& input) { + Tensor output; + output.ShareDataWith(input); + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + std::vector out_dims = {in_dims[0] * in_dims[1], in_dims[2]}; + output.Resize(make_ddim(out_dims)); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. 
+template +Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) { + Tensor output; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[1], in_dims[0], in_dims[2]}); + output.mutable_data(context.GetPlace()); + std::vector axis = {1, 0, 2}; + math::Transpose trans; + trans(context, input, &output, axis); + std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; + output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); + } else { + output.ShareDataWith(input); + } + return output; +} + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. +// +// transpose_X | False | True | False | True +// transpose_Y | False | False | True | True +// -----------+----------+----------+----------+----------- +// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T +// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T +// +// When X is a vector of size K, we treat it instead as a matrix of shape +// (1, K). Similarly, when Y is a vector of size K, we treat it instead as +// a matrix of shape (K, 1). +// +// When X and Y are both 3-dimensional tensors, then the first dimension +// the batch dimension can be ignored and the exact same formulas apply +// as for two matrices. +// +// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end +// up with formulas like +// +// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj} +// +// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N +// to X: (P * M) x K, dOut: (P * M) x N. 
+template +class MatMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + const Tensor& dout = *context.Input(framework::GradVarName("Out")); + Tensor* dx = context.Output(framework::GradVarName("X")); + Tensor* dy = context.Output(framework::GradVarName("Y")); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + + // If X is a vector, reshape it to a matrix. + if (x_dims.size() == 1) { + x_dims.insert(x_dims.begin(), 1); + } + + // If Y is a vector, reshape it to a matrix. + if (y_dims.size() == 1) { + y_dims.push_back(1); + } + + int batch_count = 0; + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + if (x_dims.size() > 3) { + batch_count = accumulate(x_dims.begin(), x_dims.end() - 2, 1, + std::multiplies()); + } + // Fix the dOut dimensions. + int M = 0, N = 0, batchCountX = 0, batchCountY = 0; + + switch (x_dims.size()) { + case 2: + M = transpose_x ? x_dims[1] : x_dims[0]; + break; + case 3: + batchCountX = x_dims[0]; + M = transpose_x ? x_dims[2] : x_dims[1]; + break; + default: + batchCountX = batch_count; + size_t mat_s = x_dims.size() - 2; + M = transpose_x ? x_dims[mat_s + 1] : x_dims[mat_s]; + } + + switch (y_dims.size()) { + case 2: + N = transpose_y ? y_dims[0] : y_dims[1]; + break; + case 3: + batchCountY = y_dims[0]; + N = transpose_y ? y_dims[1] : y_dims[2]; + break; + default: + batchCountY = batch_count; + size_t mat_s = y_dims.size() - 2; + N = transpose_y ? 
y_dims[mat_s] : y_dims[mat_s + 1]; + } + if (batchCountX && batchCountY) { + PADDLE_ENFORCE_EQ( + batchCountX, batchCountY, + "When Input(X) and Input(Y) are both three dimensional, they " + "must have the same batch dimension."); + } + int batchCount = std::max(batchCountX, batchCountY); + std::vector dout_dims = {M, N}; + if (batchCount) { + if (x_dims.size() > 3) { + dout_dims.insert(dout_dims.begin(), x_dims.begin(), x_dims.end() - 2); + } else { + dout_dims.insert(dout_dims.begin(), batchCount); + } + } + Tensor X = Reshape(x, make_ddim(x_dims)); + Tensor Y = Reshape(y, make_ddim(y_dims)); + Tensor dOut = Reshape(dout, make_ddim(dout_dims)); + + auto& dev_ctx = context.template device_context(); + if (dx) { + dx->mutable_data(context.GetPlace()); + const Tensor& dOut_for_dX = + (x_dims.size() == 2 && y_dims.size() == 3) + ? CombineBatchAndN(dev_ctx, dOut) + : dOut; + if (x_dims.size() == 2 && y_dims.size() == 3) { + Y = transpose_y ? CombineBatchAndM(Y) + : CombineBatchAndN(dev_ctx, Y); + } + if (transpose_x) { + math::MatMulFunctor()( + dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0)); + } else { + math::MatMulFunctor()( + dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0)); + } + } + + if (dy) { + dy->mutable_data(context.GetPlace()); + const Tensor& dOut_for_dY = (y_dims.size() == 2 && x_dims.size() == 3) + ? CombineBatchAndM(dOut) + : dOut; + if (y_dims.size() == 2 && x_dims.size() == 3) { + X = transpose_x ? 
CombineBatchAndN(dev_ctx, X) + : CombineBatchAndM(X); + dOut = CombineBatchAndM(dOut); + } + if (transpose_y) { + math::MatMulFunctor()( + dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0)); + } else { + math::MatMulFunctor()( + dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0)); + } + } + } +}; +} // namespace matmul_detail + +using matmul_detail::MatMulKernel; +using matmul_detail::MatMulGradKernel; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..eff8b927e52c94a4e19bb10c644cbaa34a7a0581 --- /dev/null +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +class MaxSeqenceLenOp : public framework::OperatorBase { + public: + MaxSeqenceLenOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &rank_table = + scope.FindVar(Input("RankTable"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + int64_t *out_ptr = out->mutable_data({1}, platform::CPUPlace()); + *out_ptr = rank_table.items()[0].length; + } +}; + +class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("RankTable", "The lod_rank_table."); + AddOutput("Out", "The max sequence length."); + AddComment( + R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + } +}; + +class MaxSeqenceLenInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("RankTable")); + context->SetOutputDim("Out", {1}); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSeqenceLenOp, + paddle::operators::MaxSeqenceLenOpProtoMaker, + paddle::operators::MaxSeqenceLenInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ce12cd4c4d93327dfcf9eb40ae1ff429f703419 --- /dev/null +++ b/paddle/fluid/operators/maxout_op.cc @@ -0,0 
+1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/maxout_op.h" +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of maxout operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of maxout operator." + "The format of output tensor is also NCHW." + "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr( + "groups", + R"DOC("Specifies how many groups the input tensor will be split" + "in the channel dimension. And the number of output channel is " + "the number of channels divided by groups.." + )DOC"); + AddComment(R"DOC( +MaxOut Operator. + +Assumed the input shape is (N, Ci, H, W). +The output shape is (N, Co, H, W). 
+Then $Co = Ci / groups$ and the operator formula is as follows: + +$$ +y_{si+j} = \max_k x_{gsi + sk + j} \\ +g = groups \\ +s = \frac{input.size}{num\_channels} \\ +0 \le i < \frac{num\_channels}{groups} \\ +0 \le j < s \\ +0 \le k < groups +$$ + +Please refer to Paper: + - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf + - Multi-digit Number Recognition from Street View \ + Imagery using Deep Convolutional Neural Networks: \ + https://arxiv.org/pdf/1312.6082v4.pdf + +)DOC"); + } +}; + +class MaxOutOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MaxoutOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MaxoutOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + int groups = ctx->Attrs().Get("groups"); + // check groups > 1 + PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop"); + std::vector output_shape({in_x_dims[0], in_x_dims[1] / groups}); + output_shape.push_back(in_x_dims[2]); + output_shape.push_back(in_x_dims[3]); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class MaxOutOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad, + ops::MaxOutOpGrad); +REGISTER_OP_CPU_KERNEL( + maxout, ops::MaxOutKernel); 
+REGISTER_OP_CPU_KERNEL( + maxout_grad, + ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3f45c90cde754bcbf985092c5cbf31f134a2eee --- /dev/null +++ b/paddle/fluid/operators/maxout_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/maxout_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + maxout, ops::MaxOutKernel, + ops::MaxOutKernel); +REGISTER_OP_CUDA_KERNEL( + maxout_grad, + ops::MaxOutGradKernel, + ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e5de3e3760b99c9bde9dd86e8851dc2f65e4b2d2 --- /dev/null +++ b/paddle/fluid/operators/maxout_op.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/maxouting.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MaxOutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + int groups = context.template Attr("groups"); + + math::MaxOutFunctor maxout_forward; + maxout_forward(context.template device_context(), *in_x, out, + groups); + } +}; + +template +class MaxOutGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + int groups = context.template Attr("groups"); + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0.0)); + math::MaxOutGradFunctor maxout_backward; + maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1043820345a21cc3a7bbdf45d2914f9319c8e708 --- /dev/null +++ b/paddle/fluid/operators/mean_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mean_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition of `mean`. InferShape requires input X and output Out
+// to exist and fixes the shape of Out to {1}.
+class MeanOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MeanOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MeanOp should not be null.");
+    ctx->SetOutputDim("Out", {1});
+  }
+};
+
+// Declares the proto of `mean`: one input X, one output Out and the operator
+// comment.
+class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of mean op");
+    AddOutput("Out", "The output of mean op");
+    AddComment(R"DOC(
+Mean Operator.
+
+Out is a scalar which is the mean of all elements in X.
+
+)DOC");
+  }
+};
+
+// Operator definition of `mean_grad`. The gradient of X takes the dims of X
+// and shares the LoD of X.
+// NOTE(review): unlike MeanOp::InferShape, no HasInput/HasOutput checks are
+// made here -- confirm that is intended.
+class MeanGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+// Builds the description of the `mean_grad` op: inputs are X and the gradient
+// of Out, the output is the gradient of X.
+class MeanGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  // NOTE(review): the return type reads `std::unique_ptr` with no argument
+  // list in this copy of the patch; it looks stripped -- confirm.
+  std::unique_ptr Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("mean_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    // Ownership of grad_op passes to the returned unique_ptr.
+    return std::unique_ptr(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
+REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
+REGISTER_OP_CPU_KERNEL(
+    mean, ops::MeanKernel,
+    ops::MeanKernel);
+REGISTER_OP_CPU_KERNEL(
+    mean_grad, ops::MeanGradKernel,
+    ops::MeanGradKernel);
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ccf2248760a551174953b8b55dc9a69454074885
--- /dev/null
+++ b/paddle/fluid/operators/mean_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/mean_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CUDA_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ae162287da6a5b9e826dec9552262e73468ee58a --- /dev/null +++ b/paddle/fluid/operators/mean_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// NOTE(review): the `template` keywords and the aliases below carry no <...>
+// lists in this copy of the patch; they look stripped during extraction --
+// confirm against the commit.
+using Tensor = framework::Tensor;
+template
+using EigenScalar = framework::EigenScalar;
+template
+using EigenVector = framework::EigenVector;
+
+// Forward kernel of `mean`. X is flattened into an Eigen vector, Out is
+// viewed as an Eigen scalar, and the mean reduction is evaluated on the Eigen
+// device of the current device context.
+template
+class MeanKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input("X");
+    auto* output = context.Output("Out");
+
+    output->mutable_data(context.GetPlace());
+
+    auto X = EigenVector::Flatten(*input);
+    auto y = EigenScalar::From(*output);
+    auto& place =
+        *context.template device_context().eigen_device();
+
+    y.device(place) = X.mean();
+  }
+};
+
+// Backward kernel of `mean`. Requires the gradient of Out (OG) to hold exactly
+// one element, divides it by the element count of the gradient of X (IG) and
+// broadcasts the quotient over the flattened IG.
+template
+class MeanGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto OG = context.Input(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(OG->numel() == 1, "Mean Gradient should be scalar");
+    auto IG = context.Output(framework::GradVarName("X"));
+    IG->mutable_data(context.GetPlace());
+
+    // NOTE(review): ig_size has type T and is reused as the broadcast extent
+    // below; if T is a floating-point type the extent goes through a
+    // float round-trip -- confirm this is exact for the sizes in use.
+    T ig_size = static_cast(IG->numel());
+    Eigen::DSizes bcast(ig_size);
+
+    EigenVector::Flatten(*IG).device(
+        *context.template device_context().eigen_device()) =
+        (EigenVector::From(*OG) / ig_size).broadcast(bcast);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..255f55334093213df867852e4d222f0e227e8c5d
--- /dev/null
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -0,0 +1,186 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+// Merges InTrue and InFalse into Out, one mask row at a time: a mask value of
+// 0 takes the next piece from InFalse, any other value takes it from InTrue.
+// NOTE(review): calls such as Get(), GetMutable(), Attr("level") and
+// static_cast(...) carry no <...> lists in this copy of the patch; they look
+// stripped during extraction -- confirm against the commit.
+class MergeLoDTensorOp : public framework::OperatorBase {
+ public:
+  MergeLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+
+    auto &x = scope.FindVar(Input("X"))->Get();
+    auto &mask = scope.FindVar(Input("Mask"))->Get();
+    auto &in_true = scope.FindVar(Input("InTrue"))->Get();
+    auto &in_false =
+        scope.FindVar(Input("InFalse"))->Get();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable();
+    auto level = static_cast(Attr("level"));
+
+    auto &mask_dim = mask.dims();
+
+    // Make the mask readable on the host: share the buffer when it already
+    // lives on CPU, copy it when it lives on GPU (only possible in a build
+    // with PADDLE_WITH_CUDA).
+    // NOTE(review): a mask on any other kind of place leaves cpu_mask
+    // without data before data() is called below -- confirm that cannot
+    // happen.
+    std::unique_ptr cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data();
+
+    // Out keeps the trailing dims, place and data type of InTrue; its leading
+    // dim is the sum of the leading dims of InTrue and InFalse.
+    int rank = in_true.dims().size();
+    platform::Place place = in_true.place();
+    std::type_index data_type = in_true.type();
+    framework::DDim in_true_dims =
+        framework::slice_ddim(in_true.dims(), 1, rank);
+
+    int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
+
+    auto in_true_dim_vec = framework::vectorize(in_true_dims);
+    in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
+
+    framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+
+    // Build LoDTensor `out`
+
+    // Read cursors into InTrue / InFalse; each counts the pieces already
+    // consumed from that branch.
+    size_t in_true_idx = 0;
+    size_t in_false_idx = 0;
+    for (size_t i = 0; i < static_cast(mask_dim[0]); i++) {
+      const framework::LoDTensor *input = nullptr;
+      size_t *in_idx = nullptr;
+      if (static_cast(mask_data[i]) == 0) {
+        input = &in_false;
+        in_idx = &in_false_idx;
+      } else {
+        input = &in_true;
+        in_idx = &in_true_idx;
+      }
+      // Sub-LoD and absolute row range of piece [*in_idx, *in_idx + 1) of the
+      // chosen branch (the trailing 0 is presumably the LoD level -- confirm).
+      auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+          input->lod(), *in_idx, (*in_idx) + 1, 0);
+      auto &lod_length = lod_and_offset.first;
+
+      framework::AppendLoD(out_lod, lod_length);
+
+      size_t start_offset = lod_and_offset.second.first;
+      size_t end_offset = lod_and_offset.second.second;
+
+      PADDLE_ENFORCE_GE(end_offset, start_offset);
+      size_t len = end_offset - start_offset;
+      // NOTE(review): this `continue` also skips `(*in_idx) += 1`, so after
+      // an empty piece the same piece index is queried again for the next
+      // mask row of this branch -- confirm this is intended.
+      if (len == 0) {
+        continue;
+      }
+      auto slice = out->Slice(out_offset, out_offset + len);
+      framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx,
+                      &slice);
+      out_offset += len;
+      (*in_idx) += 1;
+    }
+
+    // Prepend the first `level` LoD levels of X to the LoD of Out.
+    // NOTE(review): every level is inserted at the front, so for level > 1
+    // the copied levels end up in reverse order -- confirm.
+    for (size_t i = 0; i < level; i++) {
+      out_lod->insert(out_lod->begin(), x.lod()[i]);
+    }
+  }
+};
+
+// Declares the proto of merge_lod_tensor: inputs X, Mask, InTrue, InFalse,
+// output Out and the "level" attribute (default 0, checked with
+// EqualGreaterThan(0)).
+class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input LoDTensor, contains complete lod information to "
+             "construct the output");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddInput("InTrue", "The True branch to be merged");
+    AddInput("InFalse", "The False branch to be merged");
+    AddOutput("Out", "The merged output LoDTensor");
+    AddAttr("level", "(int) the specific lod level to rank.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Merge True and False branches of LoDTensor into a single Output,
+        with a mask at certain lod level. X is used to obtain complete
+        lod information. Please refer to SplitLoDTensorOp.)DOC");
+  }
+};
+
+// Shape inference: all four inputs and the output must exist, Mask must be
+// 2-D with a second dim of 1, and Out is given the dims of InTrue.
+// NOTE(review): Run() resizes Out to the combined leading dim of InTrue and
+// InFalse, which differs from the dims set here -- confirm this is expected.
+class MergeLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "MergeLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "MergeLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasInput("InTrue"),
+                   "MergeLoDTensorOp must has input InTrue.");
+    PADDLE_ENFORCE(context->HasInput("InFalse"),
+                   "MergeLoDTensorOp must has input InFalse.");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "MergeLoDTensorOp must has output Out");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("Out", context->GetInputDim("InTrue"));
+  }
+};
+
+// The gradient of merge_lod_tensor is expressed as a split_lod_tensor op: the
+// gradient of Out is split by the same Mask into the gradients of InTrue and
+// InFalse, reusing this op's attributes.
+class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("split_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
+    grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
+    grad_op->SetAttrMap(Attrs());
+    // Ownership of grad_op passes to the returned unique_ptr.
+    return std::unique_ptr(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
+                  ops::MergeLoDTensorOpProtoMaker,
+                  ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..73a6c0b679310ac4108a915836b5ed497853b38b
--- /dev/null
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -0,0 +1,330 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// NOTE(review): `template` keywords and calls such as Input("ClsLoss"),
+// Attr("sample_size") or static_cast(...) carry no <...> lists in this copy
+// of the patch; they look stripped during extraction -- confirm against the
+// commit.
+
+// Mining strategy selected through the "mining_type" attribute.
+enum MiningType { kNone = 0, kMaxNegative, kHardExample };
+
+// Comparator that orders pairs by descending `first`.
+template
+bool SortScoreDescend(const std::pair& pair1,
+                      const std::pair& pair2) {
+  return pair1.first > pair2.first;
+}
+
+// Tells whether a prior box is a mining candidate.
+//   kMaxNegative: only unmatched boxes (match_idx == -1) whose match_dist is
+//                 below neg_dist_threshold.
+//   kHardExample: every box.
+//   otherwise:    no box.
+inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
+                             const float match_dist,
+                             const float neg_dist_threshold) {
+  if (mining_type == MiningType::kMaxNegative) {
+    return match_idx == -1 && match_dist < neg_dist_threshold;
+  } else if (mining_type == MiningType::kHardExample) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Maps the attribute string to a MiningType; any string other than
+// "max_negative" / "hard_example" yields kNone.
+inline MiningType GetMiningType(const std::string& str) {
+  if (str == "max_negative") {
+    return MiningType::kMaxNegative;
+  } else if (str == "hard_example") {
+    return MiningType::kHardExample;
+  } else {
+    return MiningType::kNone;
+  }
+}
+
+// Kernel of mine_hard_examples. For every row n of MatchIndices it collects
+// the eligible prior boxes with their loss, keeps the neg_sel boxes with the
+// highest loss and writes the chosen negative indices to NegIndices (a
+// LoDTensor with one LoD level delimiting the rows). UpdatedMatchIndices
+// starts as a copy of MatchIndices; in hard_example mode matched boxes that
+// were not selected are reset to -1.
+template
+class MineHardExamplesKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_cls_loss = ctx.Input("ClsLoss");
+    auto* in_loc_loss = ctx.Input("LocLoss");
+    auto* in_matched_indices = ctx.Input("MatchIndices");
+    auto* in_match_dist = ctx.Input("MatchDist");
+    float neg_pos_ratio = ctx.Attr("neg_pos_ratio");
+    T neg_dist_threshold =
+        static_cast(ctx.Attr("neg_dist_threshold"));
+    int sample_size = ctx.Attr("sample_size");
+    MiningType mining_type =
+        GetMiningType(ctx.Attr("mining_type"));
+
+    auto out_neg_indices = ctx.Output("NegIndices");
+    auto out_match_indices =
+        ctx.Output("UpdatedMatchIndices");
+
+    framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);
+
+    int batch_size = in_matched_indices->dims()[0];
+    int prior_num = in_matched_indices->dims()[1];
+
+    auto match_indices = framework::EigenMatrix::From(*in_matched_indices);
+
+    auto match_indices_et =
+        framework::EigenMatrix::From(*out_match_indices);
+
+    auto match_dist = framework::EigenMatrix::From(*in_match_dist);
+
+    const T* cls_loss = in_cls_loss->data();
+    // LocLoss is optional; it only contributes in hard_example mode.
+    const T* loc_loss = nullptr;
+    if (in_loc_loss) {
+      loc_loss = in_loc_loss->data();
+    }
+
+    std::vector> all_neg_indices;
+    std::vector batch_starts = {0};
+    for (int n = 0; n < batch_size; ++n) {
+      // (loss, prior index) of every eligible box of row n.
+      std::vector> loss_idx;
+      int neg_sel = 0;
+      for (int m = 0; m < prior_num; ++m) {
+        if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
+                             neg_dist_threshold)) {
+          T loss = cls_loss[n * prior_num + m];
+          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
+            loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
+          }
+          loss_idx.push_back(std::make_pair(loss, m));
+          ++neg_sel;
+        }
+      }
+
+      // Cap the number of selected boxes: by neg_pos_ratio times the number
+      // of matched boxes in max_negative mode, by sample_size in
+      // hard_example mode.
+      if (mining_type == MiningType::kMaxNegative) {
+        int num_pos = 0;
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) != -1) ++num_pos;
+        }
+        neg_sel = std::min(static_cast(num_pos * neg_pos_ratio), neg_sel);
+      } else if (mining_type == MiningType::kHardExample) {
+        neg_sel = std::min(sample_size, neg_sel);
+      }
+
+      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend);
+      std::set sel_indices;
+      std::vector neg_indices;
+      std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
+                     std::inserter(sel_indices, sel_indices.begin()),
+                     [](std::pair& l) -> int {
+                       return static_cast(l.second);
+                     });
+
+      if (mining_type == MiningType::kHardExample) {
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) > -1) {
+            // Matched box that was not selected: drop the match.
+            if (sel_indices.find(m) == sel_indices.end()) {
+              match_indices_et(n, m) = -1;
+            }
+          } else {
+            // Unmatched box that was selected: report it as a negative.
+            if (sel_indices.find(m) != sel_indices.end()) {
+              neg_indices.push_back(m);
+            }
+          }
+        }
+      } else {
+        neg_indices.resize(sel_indices.size());
+        std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
+      }
+
+      all_neg_indices.push_back(neg_indices);
+      batch_starts.push_back(batch_starts.back() + neg_indices.size());
+    }
+
+    framework::LoD out_neg_indices_lod;
+    out_neg_indices_lod.emplace_back(batch_starts);
+    int neg_offset = 0;
+    auto neg_data = out_neg_indices->mutable_data(
+        framework::make_ddim({static_cast(batch_starts.back()), 1}),
+        ctx.GetPlace());
+
+    // Iterate by const reference: the per-row vectors are only read here.
+    for (const auto& neg_indices : all_neg_indices) {
+      std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
+      neg_offset += neg_indices.size();
+    }
+    out_neg_indices->set_lod(out_neg_indices_lod);
+    return;
+  }
+};
+
+// Operator definition of mine_hard_examples: validates the presence and the
+// shapes of the inputs, validates the attributes that belong to the selected
+// mining type and sets the dims of UpdatedMatchIndices.
+class MineHardExamplesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
+                   "Input(ClsLoss) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchIndices"),
+        "Input(MatchIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchDist"),
+        "Input(MatchDist) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NegIndices"),
+        "Output(NegIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
+                   "Output(UpdatedMatchIndices) of MineHardExamplesOp should "
+                   "not be null.");
+
+    auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
+    auto idx_dims = ctx->GetInputDim("MatchIndices");
+    auto dis_dims = ctx->GetInputDim("MatchDist");
+
+    PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
+                      "The shape of ClsLoss is [N, Np].");
+    PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
+                      "The shape of MatchIndices is [N, Np].");
+    PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
+                      "The shape of MatchDist is [N, Np].");
+
+    if (ctx->HasInput("LocLoss")) {
+      auto loc_loss_dims = ctx->GetInputDim("LocLoss");
+      PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
+                        "The shape of LocLoss is [N, Np].");
+      PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
+                        "Batch size of ClsLoss and LocLoss must be the same.");
+      PADDLE_ENFORCE_EQ(
+          cls_loss_dims[1], loc_loss_dims[1],
+          "Prior box number of ClsLoss and LocLoss must be the same.");
+    }
+
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[0], idx_dims[0],
+        "Batch size of ClsLoss and MatchIndices must be the same.");
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], idx_dims[1],
+        "Prior box number of ClsLoss and MatchIndices must be the same.");
+
+    PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
+                      "Batch size of ClsLoss and MatchDist must be the same.");
+    // Compare against MatchDist here (the check used to repeat the
+    // MatchIndices comparison, leaving MatchDist's second dim unchecked).
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], dis_dims[1],
+        "Prior box number of ClsLoss and MatchDist must be the same.");
+
+    auto mining_type =
+        GetMiningType(ctx->Attrs().Get("mining_type"));
+
+    PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
+                      "mining_type must be hard_example or max_negative");
+
+    if (mining_type == MiningType::kMaxNegative) {
+      auto neg_pos_ratio = ctx->Attrs().Get("neg_pos_ratio");
+      auto neg_dist_threshold = ctx->Attrs().Get("neg_dist_threshold");
+      PADDLE_ENFORCE_GT(
+          neg_pos_ratio, 0.0f,
+          "neg_pos_ratio must greater than zero in max_negative mode");
+      PADDLE_ENFORCE_GT(
+          neg_dist_threshold, 0.0f,
+          "neg_dist_threshold must greater than zero in max_negative mode");
+    } else if (mining_type == MiningType::kHardExample) {
+      auto sample_size = ctx->Attrs().Get("sample_size");
+      PADDLE_ENFORCE_GT(
+          sample_size, 0,
+          "sample_size must greater than zero in hard_example mode");
+    }
+
+    ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
+  }
+
+ protected:
+  // The kernel data type follows the data type of ClsLoss.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input("ClsLoss")->type()),
+        ctx.device_context());
+  }
+};
+
+// Declares inputs, outputs, attributes and the operator comment of
+// mine_hard_examples.
+class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "ClsLoss",
+        "(Tensor, default Tensor), The classification loss with shape "
+        "[N, Np], N is the batch size and Np is the number of prior box.");
+    AddInput("LocLoss",
+             "(Tensor, optional, default Tensor), The localization loss "
+             "with shape [N, Np], N is the batch size and Np is the number of "
+             "prior box.")
+        .AsDispensable();
+    AddInput("MatchIndices",
+             "(Tensor, Tensor), Matched indices with shape [N, Np], N is "
+             "the batch size and Np is the number of prior box. "
+             "MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
+             "instance does not match any entity, otherwise means it is "
+             "matched to row.");
+    AddInput("MatchDist",
+             "(Tensor, default Tensor) Matched distances with shape [N, "
+             "Np], N is the batch size and Np is the number of prior box.");
+    AddAttr("neg_pos_ratio",
+            "(float) The ratio of the negative box to the positive "
+            "box. Use only when mining_type is max_negative.")
+        .SetDefault(1.0);
+    AddAttr("neg_dist_threshold",
+            "(float) The negative overlap upper bound for the unmatched "
+            "predictions. Use only when mining_type is max_negative.")
+        .SetDefault(0.5);
+    AddAttr("sample_size",
+            "(int) The max sample size of negative box. Use only when "
+            "mining_type is hard_example.")
+        .SetDefault(0);
+    AddAttr("mining_type",
+            "(string) The mining algorithm name, the value is "
+            "hard_example or max_negative.")
+        .SetDefault("max_negative")
+        .InEnum({"hard_example", "max_negative"});
+
+    AddOutput(
+        "NegIndices",
+        "(LoDTensor) The output of negative example indices. a LoDTensor "
+        "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
+        "and each element is the prior box index. "
+        "For example, the batch size is 2, the lod is [[0, 1, 2]], "
+        "the sample 0's box 1(MatchIndices[0][1]) is selected, "
+        "and sample 1's box 0 is selected. The output NegIndices is "
+        "[[1], [0]].");
+
+    AddOutput("UpdatedMatchIndices",
+              "(Tensor) The output of updated MatchIndices, a tensor with "
+              "shape [N, Np]. Only update when mining_type is "
+              "hard_example. The input MatchIndices elements will be update to "
+              "-1 when it is not in the candidate high loss list of negative "
+              "examples.");
+
+    AddComment(R"DOC(
+Mine hard examples Operator.
+This operator implements hard example mining to select a subset of negative box indices.
+For each image, selects the box with highest losses, subject to the condition that the
+box cannot have a MatchDist >= neg_dist_threshold when mining_type is max_negative.
+The selected number is min(sample_size, max_negative_box_number) when mining_type is
+hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number)
+when mining_type is max_negative, where the max_negative_box_number is the count of
+MatchIndices elements with value -1.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
+                             ops::MineHardExamplesOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    mine_hard_examples,
+    ops::MineHardExamplesKernel,
+    ops::MineHardExamplesKernel);
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a35d668ccfa8a95cd41c4a943aad6ff915cc7dd
--- /dev/null
+++ b/paddle/fluid/operators/minus_op.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/operators/minus_op.h"
+#include "paddle/fluid/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition of `minus`. InferShape requires inputs X, Y and output
+// Out, requires X and Y to have equal dims, and gives Out the dims and the
+// LoD of X.
+class MinusOp : public framework::OperatorWithKernel {
+ public:
+  MinusOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MinusOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of MinusOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MinusOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    // The check is on the full dims, not just the element count, so the
+    // message says so.
+    PADDLE_ENFORCE_EQ(
+        x_dims, y_dims,
+        "Minus operator must take two tensors with the same shape");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// Declares the proto of `minus`: inputs X and Y, output Out and the operator
+// comment.
+class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left tensor of minus operator.");
+    AddInput("Y", "The right tensor of minus operator.");
+    AddOutput("Out", "The output tensor of minus operator.");
+
+    AddComment(R"DOC(
+Minus Operator.
+
+Equation:
+
+    $Out = X - Y$
+
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input `X`.
+
+)DOC");
+  }
+};
+
+// Expresses the gradient of `minus` as up to two `scale` ops applied to the
+// gradient of Out: scale 1.0 into the gradient of X and scale -1.0 into the
+// gradient of Y. Each op is emitted only when the matching input-gradient
+// name list is non-empty.
+// NOTE(review): `std::vector>` below has lost its <...> argument list in this
+// copy of the patch -- confirm the element type against the commit.
+class MinusGradMaker : public framework::GradOpDescMakerBase {
+ public:
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector> operator()() const override {
+    std::vector> ops;
+    auto x_g = InputGrad("X");
+    if (!x_g.empty()) {
+      auto *x_g_op = new framework::OpDesc();
+      x_g_op->SetType("scale");
+      x_g_op->SetInput("X", OutputGrad("Out"));
+      x_g_op->SetOutput("Out", x_g);
+      x_g_op->SetAttr("scale", 1.0f);
+      ops.emplace_back(x_g_op);
+    }
+
+    auto y_g = InputGrad("Y");
+    if (!y_g.empty()) {
+      auto *y_g_op = new framework::OpDesc();
+      y_g_op->SetType("scale");
+      y_g_op->SetInput("X", OutputGrad("Out"));
+      y_g_op->SetOutput("Out", y_g);
+      y_g_op->SetAttr("scale", -1.0f);
+      ops.emplace_back(y_g_op);
+    }
+
+    return ops;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    minus, ops::MinusKernel);
diff --git a/paddle/fluid/operators/minus_op.cu b/paddle/fluid/operators/minus_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ce0b1fdc0419fead915511581270fb9984df9dc5
--- /dev/null
+++ b/paddle/fluid/operators/minus_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/operators/minus_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    minus,
+    paddle::operators::MinusKernel);
diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc94cbbeca264536669716beb5318432ba9689a4
--- /dev/null
+++ b/paddle/fluid/operators/minus_op.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Kernel of `minus`: allocates Out on the current place and assigns it the
+// element-wise difference of the flattened X and Y, evaluated on the Eigen
+// device of the current device context.
+// NOTE(review): `template`, Input("X") and EigenVector carry no <...> lists
+// in this copy of the patch; they look stripped during extraction -- confirm
+// against the commit.
+template
+class MinusKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* left_tensor = context.Input("X");
+    auto* right_tensor = context.Input("Y");
+    auto* out_tensor = context.Output("Out");
+
+    out_tensor->mutable_data(context.GetPlace());
+    auto& dev =
+        *context.template device_context().eigen_device();
+    framework::EigenVector::Flatten(*out_tensor).device(dev) =
+        framework::EigenVector::Flatten(*left_tensor) -
+        framework::EigenVector::Flatten(*right_tensor);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f2d16531658d42556756e75895b7250a529f13df
--- /dev/null
+++
b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/modified_huber_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition of modified_huber_loss. InferShape requires inputs X
+// and Y with identical dims of the form [N, 1], requires both outputs to
+// exist, gives IntermediateVal the dims of X and Out the dims {N, 1}.
+class ModifiedHuberLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+    // Check the outputs before their dims are set below.
+    PADDLE_ENFORCE(
+        ctx->HasOutput("IntermediateVal"),
+        "Output(IntermediateVal) of ModifiedHuberLossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ModifiedHuberLossOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1.");
+
+    ctx->SetOutputDim("IntermediateVal", x_dims);
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+  }
+};
+
+// Declares the proto of modified_huber_loss: inputs X and Y, the intermediate
+// output IntermediateVal, the output Out and the operator comment.
+class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input tensor of modified huber loss op. "
+             "X is 2-D tensor with shape [batch_size, 1].");
+    AddInput("Y",
+             "The target labels of modified huber loss op. "
+             "The shape of Y is the same as X. Values of Y must be 0 or 1.");
+    AddOutput("IntermediateVal",
+              "Variable to save intermediate result which will be reused in "
+              "backward processing.")
+        .AsIntermediate();
+    AddOutput("Out", "Classification loss for X.");
+    AddComment(R"DOC(
+Modified Huber Loss Operator.
+
+This operator is used in binary classification problem. The shape of
+input X and target Y are both [N, 1] and so is the shape of the output loss.
+Since target Y is not differentiable, calculating gradient for Y is illegal.
+The formula of modified huber loss is:
+
+$$
+L(y, f(x)) =
+\begin{cases}
+(\max(0, 1 - yf(x)))^2,  \text{if} \  yf(x) >= -1    \\
+             -4yf(x),    \quad \text{otherwise}
+\end{cases}
+$$
+
+Make sure the values of target label Y are in {0, 1} here. This operator will
+scale values of Y to {-1, +1} when computing losses and gradients.
+
+)DOC");
+  }
+};
+
+// Operator definition of modified_huber_loss_grad. InferShape requires X, Y,
+// IntermediateVal and the gradient of Out, requires IntermediateVal and the
+// gradient of Out to have the dims of X, and gives the gradient of X (when
+// that output is present) the dims of X.
+class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("IntermediateVal"),
+                   "Intermediate value must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@Grad) must not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
+    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(
+        intermediate_dims, x_dims,
+        "The shape of X and intermediate value must be the same.");
+    PADDLE_ENFORCE_EQ(out_grad_dims, x_dims,
+                      "The shape of Input(Out@Grad) and X must be the same.");
+
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators; +REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp, + ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad, + ops::ModifiedHuberLossGradOp); + +REGISTER_OP_CPU_KERNEL( + modified_huber_loss, + ops::ModifiedHuberLossKernel); +REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..69ac2b1ed546a4755cd4e8d52a7f8b98b4f0e7b9 --- /dev/null +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/modified_huber_loss_op.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +struct ModifiedHuberLossBackward { + template + HOSTDEVICE void operator()(Tuple t) const { + auto inter_val = thrust::get<1>(t); + auto y_val = thrust::get<2>(t); + auto out_grad = thrust::get<3>(t); + if (inter_val < -1) { + thrust::get<0>(t) = -4 * (2 * y_val - 1) * out_grad; + } else if (inter_val < 1) { + thrust::get<0>(t) = -2 * (1 - inter_val) * (2 * y_val - 1) * out_grad; + } else { + thrust::get<0>(t) = 0; + } + } +}; + +template +class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Y"); + auto* in1 = context.Input("IntermediateVal"); + auto* in2 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + + if (out0) { + auto counts = framework::product(in1->dims()); + auto y_ptr = thrust::device_pointer_cast(in0->data()); + auto inter_val_ptr = thrust::device_pointer_cast(in1->data()); + auto out_grad_ptr = thrust::device_pointer_cast(in2->data()); + thrust::device_ptr x_grad_ptr( + out0->mutable_data(context.GetPlace())); + + auto iter_begin = thrust::make_zip_iterator( + thrust::make_tuple(x_grad_ptr, inter_val_ptr, y_ptr, out_grad_ptr)); + + auto iter_end = thrust::make_zip_iterator( + thrust::make_tuple(x_grad_ptr + counts, inter_val_ptr + counts, + y_ptr + counts, out_grad_ptr + counts)); + + thrust::for_each(iter_begin, iter_end, ModifiedHuberLossBackward()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + modified_huber_loss, + ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, + 
ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a470a45e13b55a33a8485cac2038ce1cc761a3f3 --- /dev/null +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +struct CheckLabelValue { + HOSTDEVICE T operator()(const T& val) const { + PADDLE_ASSERT(val == static_cast(0) || val == static_cast(1)); + } +}; + +template +struct ModifiedHuberLossForward { + HOSTDEVICE T operator()(const T& val) const { + if (val < -1) { + return -4 * val; + } else if (val < 1) { + return (1 - val) * (1 - val); + } else { + return static_cast(0); + } + } +}; + +template +class ModifiedHuberLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("IntermediateVal"); + auto* out1 = context.Output("Out"); + + out0->mutable_data(context.GetPlace()); + 
out1->mutable_data(context.GetPlace()); + auto& place = + *context.template device_context().eigen_device(); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + // make sure value's of Y in {0, 1} + y.unaryExpr(CheckLabelValue()); + auto inter_val = EigenVector::Flatten(*out0); + // scale y to {-1, +1} and compute x * y + inter_val.device(place) = x * (2 * y - static_cast(1)); + auto loss = EigenVector::Flatten(*out1); + loss.device(place) = inter_val.unaryExpr(ModifiedHuberLossForward()); + } +}; + +// CPU backward kernel +template +class ModifiedHuberLossGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Y"); + auto* in1 = context.Input("IntermediateVal"); + auto* in2 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + + if (out0) { + const T* y_ptr = in0->data(); + const T* inter_val_ptr = in1->data(); + const T* out_grad_ptr = in2->data(); + size_t counts = static_cast(framework::product(in1->dims())); + T* x_grad_ptr = out0->mutable_data(context.GetPlace()); + for (size_t i = 0; i < counts; ++i) { + if (inter_val_ptr[i] < -1) { + x_grad_ptr[i] = -4 * (2 * y_ptr[i] - 1) * out_grad_ptr[i]; + } else if (inter_val_ptr[i] < 1) { + x_grad_ptr[i] = -2 * (1 - inter_val_ptr[i]) * (2 * y_ptr[i] - 1) * + out_grad_ptr[i]; + } else { + x_grad_ptr[i] = 0; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3950ac99da93687596f62c6f020f4add1fb04ba --- /dev/null +++ b/paddle/fluid/operators/momentum_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/momentum_op.h" + +namespace paddle { +namespace operators { + +class MomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(param) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(grad) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Velocity"), + "Input(velocity) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of Momentum should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), + "Output(VelocityOut) of Momentum should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad input of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Velocity"), + "Param and Velocity of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, + "Learning_rate should be a scalar"); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("VelocityOut", param_dim); + } +}; + +class MomentumOpMaker 
: public framework::OpProtoAndCheckerMaker { + public: + MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated"); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter"); + AddInput("Velocity", + "(Tensor, default Tensor) " + "Input velocity (corresponding to the parameter) " + "that has to be updated"); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "Input learning rate"); + + AddOutput("ParamOut", + "(Tensor) This output is updated parameter. " + "It shared memory with Input(Param)."); + AddOutput("VelocityOut", + "(Tensor) This output is updated velocity. " + "It shared memory with Input(Velocity)."); + + AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("use_nesterov", + "(bool, default false) " + "Use Nesterov Momentum") + .SetDefault(false); + AddComment(R"DOC( +Momentum Optimizer. + +This optimizer has a flag for Nestrov Momentum. +The update equations are as follows: + +$$ +velocity = mu * velocity + gradient \\ +if (use\_nesterov): \\ + param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\ +else: \\ + param = param - learning\_rate * velocity. \\ +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker); +REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..28a14cd4b219b86ba4922a3485b99c2c861a74d9 --- /dev/null +++ b/paddle/fluid/operators/momentum_op.cu @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +__global__ void MomentumKernel(const T* p, const T* g, const T* v, + const T* learning_rate, const T mu, + const int64_t num, bool use_nesterov, T* p_out, + T* v_out) { + T lr = learning_rate[0]; + if (use_nesterov) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T g_val = g[i]; + T v_new = v[i] * mu + g_val; + v_out[i] = v_new; + p_out[i] = p[i] - (g_val - v_new * mu) * lr; + } + } else { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T v_new = v[i] * mu + g[i]; + v_out[i] = v_new; + p_out[i] = p[i] - lr * v_new; + } + } +} + +template +class MomentumOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + T* p_out = param_out->mutable_data(ctx.GetPlace()); + T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto* p = param->data(); + auto* v = velocity->data(); + auto* g = grad->data(); + auto* lr = 
learning_rate->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + MomentumKernel<<>>( + p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel, + ops::MomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fdab86b24eefe15b85f4ca6a49e54ea67c1e7bdf --- /dev/null +++ b/paddle/fluid/operators/momentum_op.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class MomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + param_out->mutable_data(ctx.GetPlace()); + velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto p_out = framework::EigenVector::Flatten(*param_out); + auto v_out = framework::EigenVector::Flatten(*velocity_out); + + auto p = framework::EigenVector::Flatten(*param); + auto v = framework::EigenVector::Flatten(*velocity); + auto g = framework::EigenVector::Flatten(*grad); + auto* lr = learning_rate->data(); + + v_out = v * mu + g; + if (use_nesterov) { + p_out = p - (g - v_out * mu) * lr[0]; + } else { + p_out = p - lr[0] * v_out; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9375d8ea1297f5201b6294f61119f6b5603d988 --- /dev/null +++ b/paddle/fluid/operators/mul_op.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mul_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MulOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MulOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims"); + int y_num_col_dims = ctx->Attrs().Get("y_num_col_dims"); + + VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims + << " x_num_col_dims=" << x_num_col_dims + << " y_num_col_dims=" << y_num_col_dims; + + PADDLE_ENFORCE_GT( + x_dims.size(), x_num_col_dims, + "The input tensor X's rank of MulOp should be larger than " + "x_num_col_dims."); + PADDLE_ENFORCE_GT( + y_dims.size(), y_num_col_dims, + "The input tensor Y's rank of MulOp should be larger than " + "y_num_col_dims."); + + auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); + auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); + + PADDLE_ENFORCE_EQ( + x_mat_dims[1], y_mat_dims[0], + "First matrix's width must be equal with second matrix's height."); + std::vector output_dims; + output_dims.reserve( + static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); + + for (int i = 0; i < x_num_col_dims; ++i) { + 
output_dims.push_back(x_dims[i]); + } + + for (int i = y_num_col_dims; i < y_dims.size(); ++i) { + output_dims.push_back(y_dims[i]); + } + + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), The first input tensor of mul op."); + AddInput("Y", "(Tensor), The second input tensor of mul op."); + AddOutput("Out", "(Tensor), The output tensor of mul op."); + AddAttr( + "x_num_col_dims", + R"DOC((int, default 1), The mul_op can take tensors with more than two + dimensions as its inputs. If the input $X$ is a tensor with more + than two dimensions, $X$ will be flattened into a two-dimensional + matrix first. The flattening rule is: the first `num_col_dims` + will be flattened to form the first dimension of the final matrix + (the height of the matrix), and the rest `rank(X) - num_col_dims` + dimensions are flattened to form the second dimension of the final + matrix (the width of the matrix). As a result, height of the + flattened matrix is equal to the product of $X$'s first + `x_num_col_dims` dimensions' sizes, and width of the flattened + matrix is equal to the product of $X$'s last `rank(x) - num_col_dims` + dimensions' size. For example, suppose $X$ is a 6-dimensional + tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3. + Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = + [24, 30]. + )DOC") + .SetDefault(1) + .EqualGreaterThan(1); + AddAttr( + "y_num_col_dims", + R"DOC((int, default 1), The mul_op can take tensors with more than two, + dimensions as its inputs. If the input $Y$ is a tensor with more + than two dimensions, $Y$ will be flattened into a two-dimensional + matrix first. The attribute `y_num_col_dims` determines how $Y$ is + flattened. 
See comments of `x_num_col_dims` for more details. + )DOC") + .SetDefault(1) + .EqualGreaterThan(1); + AddComment(R"DOC( +Mul Operator. + +This operator is used to perform matrix multiplication for input $X$ and $Y$. + +The equation is: + +$$Out = X * Y$$ + +Both the input $X$ and $Y$ can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input $X$. + +)DOC"); + } +}; + +class MulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + auto x_mat_dims = framework::flatten_to_2d( + x_dims, ctx->Attrs().Get("x_num_col_dims")); + auto y_mat_dims = framework::flatten_to_2d( + y_dims, ctx->Attrs().Get("y_num_col_dims")); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, + ops::MulOpShapeInference, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); +REGISTER_OP_CPU_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc new file mode 100644 index 
0000000000000000000000000000000000000000..6f605fd84fb802a079f9ed13afc032cc2d6c2b0c --- /dev/null +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mul_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CUDA_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..745989f07f3646ab5d59f3d292030f3eb52dac49 --- /dev/null +++ b/paddle/fluid/operators/mul_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const Tensor* y = context.Input("Y"); + Tensor* z = context.Output("Out"); + const Tensor x_matrix = + x->dims().size() > 2 + ? framework::ReshapeToMatrix( + *x, context.template Attr("x_num_col_dims")) + : *x; + const Tensor y_matrix = + y->dims().size() > 2 + ? framework::ReshapeToMatrix( + *y, context.template Attr("y_num_col_dims")) + : *y; + + z->mutable_data(context.GetPlace()); + auto z_dim = z->dims(); + if (z_dim.size() != 2) { + z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + math::matmul( + context.template device_context(), x_matrix, false, + y_matrix, false, 1, z, 0); + if (z_dim.size() != 2) { + z->Resize(z_dim); + } + } +}; + +template +class MulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int x_num_col_dims = ctx.template Attr("x_num_col_dims"); + int y_num_col_dims = ctx.template Attr("y_num_col_dims"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? 
framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; + const Tensor* dout = ctx.Input(framework::GradVarName("Out")); + + Tensor dout_mat; + dout_mat.ShareDataWith(*dout); + dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + + Tensor* dx = ctx.Output(framework::GradVarName("X")); + Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto& dev_ctx = ctx.template device_context(); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + Tensor dx_matrix = dx->dims().size() > 2 + ? framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; + + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + math::matmul(dev_ctx, dout_mat, false, y_matrix, true, + 1, &dx_matrix, 0); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + Tensor dy_matrix = dy->dims().size() > 2 + ? framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + math::matmul(dev_ctx, x_matrix, true, dout_mat, false, + 1, &dy_matrix, 0); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2934f69cc9b2e50bdd5cbdf04deeaf5ca120e2c --- /dev/null +++ b/paddle/fluid/operators/multiclass_nms_op.cc @@ -0,0 +1,384 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +constexpr int64_t kOutputDim = 6; +constexpr int64_t kBBoxSize = 4; + +class MultiClassNMSOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("BBoxes"), + "Input(BBoxes) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scores"), + "Input(Scores) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MultiClassNMS should not be null."); + + auto box_dims = ctx->GetInputDim("BBoxes"); + auto score_dims = ctx->GetInputDim("Scores"); + + PADDLE_ENFORCE_EQ(box_dims.size(), 2, + "The rank of Input(BBoxes) must be 2."); + PADDLE_ENFORCE_EQ(score_dims.size(), 3, + "The rank of Input(Scores) must be 3."); + PADDLE_ENFORCE_EQ(box_dims[1], 4, + "The 2nd dimension of Input(BBoxes) must be 4, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax]"); + PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2], + "The 1st dimensiong of Input(BBoxes) must be equal to " + "3rd dimension of Input(Scores), which represents the " + "predicted bboxes."); + + // Here the box_dims[0] is not the real dimension of output. + // It will be rewritten in the computing kernel. 
+ ctx->SetOutputDim("Out", {box_dims[0], 6}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Scores")->type()), + ctx.device_context()); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +class MultiClassNMSKernel : public framework::OpKernel { + public: + void NMSFast(const Tensor& bbox, const Tensor& scores, + const T score_threshold, const T nms_threshold, const T eta, + const int64_t top_k, std::vector* selected_indices) const { + // The total boxes for each instance. 
+ int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + } + + void MultiClassNMS(const framework::ExecutionContext& ctx, + const Tensor& scores, const Tensor& bboxes, + std::map>& indices, + int& num_nmsed_out) const { + int64_t background_label = ctx.Attr("background_label"); + int64_t nms_top_k = ctx.Attr("nms_top_k"); + int64_t keep_top_k = ctx.Attr("keep_top_k"); + T nms_threshold = static_cast(ctx.Attr("nms_threshold")); + T nms_eta = static_cast(ctx.Attr("nms_eta")); + T score_threshold = static_cast(ctx.Attr("score_threshold")); + + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + Tensor score = scores.Slice(c, c + 1); + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, + &(indices[c])); + num_det += indices[c].size(); + } + + num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> 
score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(indices); + num_nmsed_out = keep_top_k; + } + } + + void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, + std::map>& selected_indices, + Tensor* outs) const { + int predict_dim = scores.dims()[1]; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + odata[count * kOutputDim] = label; // label + odata[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); + auto* outs = ctx.Output("Out"); + + auto score_dims = scores->dims(); + + int64_t batch_size = score_dims[0]; + int64_t class_num = score_dims[1]; + int64_t 
predict_dim = score_dims[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + std::map> indices; + int num_nmsed_out = 0; + MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + T* od = outs->mutable_data({1}, ctx.GetPlace()); + od[0] = -1; + } else { + outs->mutable_data({num_kept, kOutputDim}, ctx.GetPlace()); + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(ins_score, *boxes, all_indices[i], &out); + } + } + } + + framework::LoD lod; + lod.emplace_back(batch_starts); + + outs->set_lod(lod); + } +}; + +class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("BBoxes", + "(Tensor) A 2-D Tensor with shape [M, 4] represents the " + "predicted locations of M bounding bboxes. Each bounding box " + "has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax]."); + AddInput("Scores", + "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "predicted confidence predictions. N is the batch size, C is the " + "class number, M is number of bounding boxes. For each category " + "there are total M scores which corresponding M bounding boxes. " + " Please note, M is equal to the 1st dimension of BBoxes. "); + AddAttr( + "background_label", + "(int64_t, defalut: 0) " + "The index of background label, the background label will be ignored. 
" + "If set to -1, then all categories will be considered.") + .SetDefault(0); + AddAttr("score_threshold", + "(float) " + "Threshold to filter out bounding boxes with low " + "confidence score. If not provided, consider all boxes."); + AddAttr("nms_top_k", + "(int64_t) " + "Maximum number of detections to be kept according to the " + "confidences aftern the filtering detections based on " + "score_threshold"); + AddAttr("nms_threshold", + "(float, defalut: 0.3) " + "The threshold to be used in NMS.") + .SetDefault(0.3); + AddAttr("nms_eta", + "(float) " + "The parameter for adaptive NMS.") + .SetDefault(1.0); + AddAttr("keep_top_k", + "(int64_t) " + "Number of total bboxes to be kept per image after NMS " + "step. -1 means keeping all bboxes after NMS step."); + AddOutput("Out", + "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax], No is the total " + "number of detections in this mini-batch. For each instance, " + "the offsets in first dimension are called LoD, the number of " + "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " + "no detected bbox."); + AddComment(R"DOC( +This operator is to do multi-class non maximum suppression (NMS) on a batched +of boxes and scores. + +In the NMS step, this operator greedily selects a subset of detection bounding +boxes that have high scores larger than score_threshold, if providing this +threshold, then selects the largest nms_top_k confidences scores if nms_top_k +is larger than -1. Then this operator pruns away boxes that have high IOU +(intersection over union) overlap with already selected boxes by adaptive +threshold NMS based on parameters of nms_threshold and nms_eta. + +Aftern NMS step, at most keep_top_k number of total bboxes are to be kept +per image if keep_top_k is larger than -1. + +This operator support multi-class and batched inputs. It applying NMS +independently for each class. 
The outputs is a 2-D LoDTenosr, for each +image, the offsets in first dimension of LoDTensor are called LoD, the number +of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, +means there is no detected bbox for this image. If there is no detected boxes +for all images, all the elements in LoD are 0, and the Out only contains one +value which is -1. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp, + ops::MultiClassNMSOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f89b00376ba7a759419fa60efe80575b6a8d1f2e --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/multiplex_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class MultiplexOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) shouldn't be null."); + PADDLE_ENFORCE(!ctx->Inputs("X").empty(), + "MultiInput(X) shouldn't be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + auto ids_dim = ctx->GetInputDim("Ids"); + PADDLE_ENFORCE( + ids_dim.size() == 2 && ids_dim[1] == 1, + "The index tensor must be a vector with size batchSize x 1."); + + auto ins_dims = ctx->GetInputsDim("X"); + auto num_ins = ins_dims.size(); + PADDLE_ENFORCE(num_ins > 1, + "multiplex operator should have more than " + "one candidate input tensors."); + + auto in_dim = ins_dims[0]; + PADDLE_ENFORCE(in_dim.size() >= 2, + "The rank of candidate tensors must be not less than 2."); + for (size_t i = 1; i < num_ins; i++) { + auto dim = ins_dims[i]; + PADDLE_ENFORCE(in_dim == dim, + "All the candidate tensors must have the same size."); + } + ctx->SetOutputDim("Out", in_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); + } +}; + +class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ids", "The index tensor of multiplex operator."); + AddInput("X", "The candidate tensors of multiplex operator.") + .AsDuplicable(); + AddOutput("Out", "The output tensor of multiplex operator."); + AddComment(R"DOC( +Multiplex Operator. 
+ +Multiplex multiple tensors according to the index provided by the index tensor. + +Ids: the index tensor. +X[0 : N - 1]: the candidate tensors for output (N >= 2). +For each index i from 0 to batchSize - 1, the output is the i-th row of the +the (Ids[i])-th tensor. + +For i-th row of the output tensor: + +$$y[i] = x_{k}[i]$$ + +where `y` is the output tensor, `x_{k}` is the k-th input tensor, +and `k = Ids[i]`. + +)DOC"); + } +}; + +class MultiplexGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null."); + PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(), + "Output(X@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); +REGISTER_OP_CPU_KERNEL( + multiplex, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel); +REGISTER_OP_CPU_KERNEL( + multiplex_grad, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu new file mode 100644 index 
0000000000000000000000000000000000000000..3ef7ef1dfcd04d59573bc6c726fef757f5f2ce23 --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.cu @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/multiplex_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MultiplexGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto ins = ctx.MultiInput("X"); + auto* ids = ctx.Input("Ids"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + // copy index to cpu + Tensor index_t_cpu; + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.cuda_device_context().stream(); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); + PADDLE_ENFORCE_LT((size_t)k, ins.size(), + "index exceeds the number of candidate tensors."); + memory::Copy(place, out->data() + i * cols, place, + ins[k]->data() + i * cols, cols * sizeof(T), stream); + } + } +}; + +template +class MultiplexGradGPUKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto* ids = ctx.Input("Ids"); + auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); + for (size_t i = 0; i < d_ins.size(); i++) { + if (d_ins[i]) { + d_ins[i]->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*d_ins[i]); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); + } + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + // copy index to cpu + Tensor index_t_cpu; + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + auto* index = index_t_cpu.data(); + + auto stream = ctx.cuda_device_context().stream(); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (d_ins[k]) { + memory::Copy(place, d_ins[k]->data() + i * cols, place, + d_out->data() + i * cols, cols * sizeof(T), stream); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + multiplex, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel); +REGISTER_OP_CUDA_KERNEL( + multiplex_grad, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h new file mode 100644 index 0000000000000000000000000000000000000000..682117cb1b4581560d6b4a615d97e2d18a91ffd6 --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace paddle { +namespace operators { + +template +class MultiplexCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto ins = ctx.MultiInput("X"); + auto ids = ctx.Input("Ids"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto index = ids->data(); + platform::CPUPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); + PADDLE_ENFORCE_LT(static_cast(k), ins.size(), + "index exceeds the number of candidate tensors."); + memory::Copy(place, out->data() + i * cols, place, + ins[k]->data() + i * cols, cols * sizeof(T)); + } + } +}; + +template +class MultiplexGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* ids = ctx.Input("Ids"); + auto ins = ctx.MultiInput("X"); + auto d_ins = + ctx.MultiOutput(framework::GradVarName("X")); + for (size_t i = 0; i < d_ins.size(); i++) { + if (d_ins[i]) { + d_ins[i]->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*d_ins[i]); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); + } + } + + auto rows = ins[0]->dims()[0]; + 
auto cols = ins[0]->numel() / rows; + auto* index = ids->data(); + platform::CPUPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (d_ins[k]) { + memory::Copy(place, d_ins[k]->data() + i * cols, place, + d_out->data() + i * cols, cols * sizeof(T)); + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt similarity index 100% rename from paddle/operators/nccl/CMakeLists.txt rename to paddle/fluid/operators/nccl/CMakeLists.txt diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a8ce932ec51a3d85ef04a2c07e08186929632f2 --- /dev/null +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace platform {} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h new file mode 100644 index 0000000000000000000000000000000000000000..6e78613239e6c401bad5aa80746000c5b47cd031 --- /dev/null +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +constexpr int kInvalidGPUId = -1; + +struct Communicator { + std::vector comms_; + std::unordered_map comm_id_map_; + bool inited_ = false; // read by ~Communicator() even if InitAll() never ran + + Communicator() {} + + int GetCommId(int device_id) const { return comm_id_map_.at(device_id); } + + void InitAll(const std::vector& gpus) { + comms_.resize(gpus.size()); + inited_ = false; + for (size_t i = 0; i < gpus.size(); ++i) { + comm_id_map_[gpus[i]] = i; + } + PADDLE_ENFORCE( + dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data())); + inited_ = true; + } + + ~Communicator() { + if (inited_) { + for (size_t i = 0; i < comms_.size(); ++i) { + // FIXME(dzh) : PADDLE_ENFORCE return void + dynload::ncclCommDestroy(comms_[i]); + } + } + } + + DISABLE_COPY_AND_ASSIGN(Communicator); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..52420ceba0de0323dae000aa301ce7838b3311b6 --- /dev/null +++ b/paddle/fluid/operators/nccl_op.cc @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" + +namespace paddle { +namespace operators { + +// NCCLinitOp +class NCCLInitOp : public framework::OperatorBase { + public: + NCCLInitOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + const auto &name = Output("Communicator"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), + "Can not find variable '%s' in the scope.", name); + std::vector gpus = Attr>("gpus"); + PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty."); + + if (scope.FindVar(name) == nullptr) { + PADDLE_THROW("Output(Communicator) is needed for ncclInit operator."); + } + + platform::Communicator *comm = + scope.FindVar(name)->GetMutable(); + comm->InitAll(gpus); + } +}; + +class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Communicator", + "Create Communicator for communicating between gpus"); + AddAttr>("gpus", "(vector) GPU id lists"); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddComment(R"DOC( +NCCLInit Operator. + +Create communicator. 
+ +)DOC"); + } +}; + +// AllReduceOp: validates X, Communicator and Out; Out copies X's dims and LoD. +class NCCLAllReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE( + ctx->HasInput("Communicator"), + " Input(Communicator) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Output(Out) of AllReduce op output should not be NULL"); + + auto x_dims = ctx->GetInputsDim("X"); + + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); + + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// ReduceOp: validates X, Communicator and Out; Out copies X's dims and LoD. +class NCCLReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of Reduce op input should not be NULL"); + PADDLE_ENFORCE( + ctx->HasInput("Communicator"), + " Input(Communicator) of Reduce op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Output(Out) of Reduce op output should not be NULL"); + + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// BcastOp +class NCCLBcastOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + 
PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of Bcast op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasInput("Communicator"), + " Input(Communicator) of Bcast op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Output(Out) of Bcast op output should not be NULL"); + + int root = ctx->Attrs().Get("root"); + PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set."); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// AllreduceOp +class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of AllReduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of AllReduce op"); + AddAttr("reduction", + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); + AddComment(R"DOC( +NCCLAllReduce Operator. + +AllReduce the input tensors. + +)DOC"); + } +}; + +// ReduceOp +class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of Reduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of Reduce op"); + AddAttr("reduction", + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); + AddAttr("root", + "(int, default kInvalidGPUId) " + "Root gpu of the parameter. If not, " + "set(platform::kInvalidGPUId). Hashed by name.") + .SetDefault(platform::kInvalidGPUId); + AddComment(R"DOC( +NCCLReduce Operator. + +Reduce the tensors. 
+ +)DOC"); + } +}; + +// BcastOp +class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of BcastSend op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of Bcast"); + AddAttr("root", + "(int, default kInvalidGPUId) " + "Root gpu of the parameter. If not, " + "set(platform::kInvalidGPUId). Hashed by name.") + .SetDefault(platform::kInvalidGPUId); + AddComment(R"DOC( +NCCLBcast Operator. + +Bcast the tensors. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp, + paddle::framework::EmptyGradOpMaker, ops::NCCLInitOpMaker); + +REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, + ops::NCCLAllReduceOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp, + ops::NCCLBcastOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp, + ops::NCCLReduceOpMaker); diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..333aed2903e7873aa799bd34468b2e05ef2e556c --- /dev/null +++ b/paddle/fluid/operators/nccl_op.cu.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::Communicator; +using framework::LoDTensor; + +template +class NCCLTypeWrapper; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclFloat; +}; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclDouble; +}; + +template +class NCCLAllReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + + std::string reduction = ctx.Attr("reduction"); + ncclRedOp_t reduction_op_ = ncclSum; + + if (reduction == "ncclMin") { + reduction_op_ = ncclMin; + } else if (reduction == "ncclMax") { + reduction_op_ = ncclMax; + } else if (reduction == "ncclSum") { + reduction_op_ = ncclSum; + } else if (reduction == "ncclProd") { + reduction_op_ = ncclProd; + } else { + PADDLE_THROW("Invalid reduction. default ncclSum."); + } + + auto* comm = ctx.Input("Communicator"); + + auto stream = ctx.cuda_device_context().stream(); + + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); + + for (size_t i = 0; i < ins.size(); ++i) { + VLOG(1) << "gpu : " + << " invoke allreduce. send " << ins[i]->numel() << " recv " + << outs[i]->numel(); + + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), + outs[i]->numel(), NCCLTypeWrapper::type, reduction_op_, + comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " + << " finished allreduce. 
send " << ins[i]->numel() << " recv " + << outs[i]->numel(); + } + } +}; + +template +class NCCLReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + auto ins = ctx.MultiInput("X"); // x0, x1, x2 + auto outs = ctx.MultiOutput("Out"); + + std::string reduction = ctx.Attr("reduction"); + ncclRedOp_t reduction_op_ = ncclSum; + + if (reduction == "ncclMin") { + reduction_op_ = ncclMin; + } else if (reduction == "ncclMax") { + reduction_op_ = ncclMax; + } else if (reduction == "ncclSum") { + reduction_op_ = ncclSum; + } else if (reduction == "ncclProd") { + reduction_op_ = ncclProd; + } else { + PADDLE_THROW("Invalid reduction. default ncclSum."); + } + + int root = ctx.Attr("root"); + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); + + auto ins_names = ctx.Inputs("X"); + std::hash hasher; + for (size_t i = 0; i < ins.size(); ++i) { + if (root == platform::kInvalidGPUId) { + root = hasher(ins_names[i]) % comm->comms_.size(); + } + T* recvbuffer = nullptr; + if (root == gpu_id) { + recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); + } + + VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send " + << ins[i]->numel() << " recv " << outs[i]->numel(); + + PADDLE_ENFORCE(platform::dynload::ncclReduce( + ins[i]->data(), recvbuffer, ins[i]->numel(), + NCCLTypeWrapper::type, reduction_op_, root, comm->comms_[idx], + stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished reduce. 
send " + << ins[i]->numel() << " recv " << outs[i]->numel(); + } + } +}; + +template +class NCCLBcastKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + int root = ctx.Attr("root"); + + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); + + if (idx == root) { + auto ins = ctx.MultiInput("X"); + for (size_t i = 0; i < ins.size(); ++i) { + VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send " + << ins[i]->numel(); + + VLOG(1) << " before ncclBcast"; + PADDLE_ENFORCE(platform::dynload::ncclBcast( + (void*)ins[i]->data(), ins[i]->numel(), NCCLTypeWrapper::type, + root, comm->comms_[idx], stream)); + VLOG(1) << " after ncclBcast"; + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished Bcast."; + } + } else { + auto outs = ctx.MultiOutput("Out"); + for (size_t i = 0; i < outs.size(); ++i) { + VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " + << framework::product(outs[i]->dims()); + + PADDLE_ENFORCE(platform::dynload::ncclBcast( + outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), + NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished Bcast. 
recv " + << outs[i]->numel(); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); +REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..212ed2f9b63de5061dc3a2eb86508d8a4c305f89 --- /dev/null +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -0,0 +1,318 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +USE_NO_KERNEL_OP(ncclInit); +USE_CUDA_ONLY_OP(ncclAllReduce); +USE_CUDA_ONLY_OP(ncclReduce); +USE_CUDA_ONLY_OP(ncclBcast); + +namespace f = paddle::framework; +namespace p = paddle::platform; + +static std::vector gpu_list; + +// test data amount +const f::DDim kDims = {100, 100}; + +// nccl op common tester, init communicator. +class NCCLTester : public ::testing::Test { + public: + virtual void SetUp() override { + paddle::platform::CPUPlace cpu_place; + for (size_t i = 0; i < gpu_list.size(); ++i) { + p::CUDAPlace place(i); + dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); + } + + NCCLInitOp(); + } + + virtual void TearDown() override { + for (auto &device_context : dev_ctxs) { + delete device_context; + } + } + + void NCCLInitOp() { + paddle::platform::CPUPlace cpu_place; + std::unique_ptr op1(new f::OpDesc); + + op1->SetType("ncclInit"); + op1->SetOutput("Communicator", {"comm"}); + op1->SetAttr("gpus", {gpu_list}); + + auto *var = g_scope.Var("comm"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, cpu_place); + VLOG(1) << "NCCLInitOp finished."; + } + + template + void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { + std::unique_lock lk(mu); + const f::OpDesc *op1 = &op_desc; + + p::CUDAPlace place(gpu_id); + auto &ctx = dev_ctxs.at(gpu_id); + + auto 
*send_tensor = scope->Var("st")->GetMutable(); + auto *recv_tensor = scope->Var("rt")->GetMutable(); + + if (!send_tensor->numel()) { + send_tensor->Resize(kDims); + send_tensor->mutable_data(kDims, place); + + std::vector send_vector(f::product(kDims), gpu_id); + paddle::framework::CopyFromVector(send_vector, *ctx, send_tensor); + ctx->Wait(); + VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); + } + + lk.unlock(); + + PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims), + "Tensor numel not match!"); + + auto op = f::OpRegistry::CreateOp(*op1); + + VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); + VLOG(1) << " send_tensor : " << send_tensor->numel() + << " recv_tensor : " << recv_tensor->numel(); + op->Run(*scope, place); + VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); + } + + public: + std::vector dev_ctxs; + f::Scope g_scope; + std::mutex mu; +}; + +// ncclInitOp with desc +TEST(NCCL, ncclInitOp) { + std::unique_ptr op_desc(new f::OpDesc); + + op_desc->SetType("ncclInit"); + op_desc->SetOutput("Communicator", {"x1"}); + op_desc->SetAttr("gpus", {gpu_list}); + + f::Scope g_scope; + paddle::platform::CPUPlace cpu_place; + + auto *var = g_scope.Var("x1"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op_desc); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, cpu_place); + VLOG(1) << "NCCLInitOp finished."; +} + +// ncclAllReduceOp with desc +TEST_F(NCCLTester, ncclAllReduceOp) { + std::unique_ptr op2(new f::OpDesc); + op2->SetType("ncclAllReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { 
+ ths[i].join(); + } + + // check results + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + for (size_t i = 0; i < dev_scopes.size(); ++i) { + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[i]); + + auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[i])->stream()); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } + } +} + +// ncclReduceOp with desc +TEST_F(NCCLTester, ncclReduceOp) { + std::unique_ptr op2(new f::OpDesc); + const int kRoot = 0; + op2->SetType("ncclReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", kRoot); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + // check results on + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[kRoot]); + + auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = + dev_scopes[kRoot]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[kRoot])->stream()); + + for (int64_t j = 0; j < f::product(kDims); 
++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } +} + +// ncclBcastOp with desc +TEST_F(NCCLTester, ncclBcastOp) { + std::unique_ptr op2(new f::OpDesc); + const int kRoot = 0; + op2->SetType("ncclBcast"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", kRoot); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + const int idx = 1; + // check results on + float result = kRoot; + + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[idx]); + + auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[idx])->stream()); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } +} + +int main(int argc, char **argv) { + // FIXME(tonyyang-svail): + // Due to the driver issue on our CI, disable for now + return 0; + const int dev_count = p::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + gpu_list.emplace_back(i); + } + + VLOG(0) << " DeviceCount " << count; + 
paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + + // device context should be release before scope. + // otherwise driver will down. + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0841313a1042ea4099473fdc0293d9bae4a7c8c3 --- /dev/null +++ b/paddle/fluid/operators/nce_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/nce_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class NCEOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input")); + PADDLE_ENFORCE(ctx->HasInput("Label")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasOutput("Cost")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLabels")); + + auto x_dims = ctx->GetInputDim("Input"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); + int num_true_classes = label_dims.size() == 2 ? 
label_dims[1] : 1; + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0], + ctx->GetInputDim("Bias")[0]); + } + auto num_neg_samples = ctx->Attrs().Get("num_neg_samples"); + auto num_total_classes = ctx->Attrs().Get("num_total_classes"); + std::vector custom_neg_classes = + ctx->Attrs().Get>("custom_neg_classes"); + PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]); + if (custom_neg_classes.size() > 0) { + PADDLE_ENFORCE_EQ(custom_neg_classes.size(), + static_cast(num_neg_samples)); + } + // set dims of output(Out) + std::vector out_dims; + out_dims.push_back(x_dims[0]); + out_dims.push_back(1); + ctx->SetOutputDim("Cost", framework::make_ddim(out_dims)); + + // set dims of output(SampleOut) + std::vector sample_out_dims; + sample_out_dims.push_back(x_dims[0]); + sample_out_dims.push_back(num_neg_samples + num_true_classes); + ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims)); + ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +class NCEOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim]."); + AddInput( + "Label", + "(Tensor) A tensor of shape [batch_size, num_true_class]. " + "'num_true_class' is the number of target classes in each sample." + "The number of target classes per sample should be same. " + "If you have a variable number of target classes, " + "you can pad them out to a constant number by either repeating them" + " or by padding with an otherwise unused class.)"); + AddInput("Weight", + "(Tensor) A tensor of shape [num_class, dim]. 
'num_class' is the " + "total number of class."); + AddInput( + "Bias", + "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total " + "number of class. It is a dispensable input.") + .AsDispensable(); + AddInput("SampleWeight", + "(Tensor) A tensor of shape [batch_size, 1] storing a weight for " + "each sample. And it is a dispensable input. The default value of " + "sample is 1.") + .AsDispensable(); + AddOutput("Cost", + "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples."); + AddOutput("SampleLogits", + "An intermediate tensor of shape[batch_size, num_neg_samples + " + "num_pos_samples]." + "This tensor is output of forward kernel and used in backward " + "kernel to compute grads." + "Given X is the dot product of input tensor and sampled labels' " + "weights." + "Then 'SampleLogits' is sigmoid(X).") + .AsIntermediate(); + AddOutput("SampleLabels", + "An intermediate tensor of shape[batch_size, num_neg_samples + " + "num_pos_samples]." + "This tensor is output of forward kernel and used in backward " + "kernel to compute grads." + "") + .AsIntermediate(); + AddAttr("num_total_classes", + "Total number of classes in all samples."); + AddAttr("num_neg_samples", + "The number of negative classes. The default value is 10.") + .SetDefault(10); + AddAttr>("custom_neg_classes", + "This attribute only be used in unitest. Classes " + "in this list wiil be used as negative classes " + "for every samples. Under normal conditions, " + "user should avoid setting this attribute.") + .SetDefault({}); + AddComment(R"DOC( +Compute and return the noise-contrastive estimation training loss. +See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). +By default this operator uses a uniform distribution for sampling. 
+)DOC"); + } +}; + +class NCEOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasInput("Cost")); + PADDLE_ENFORCE(ctx->HasInput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasInput("SampleLabels")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")), + "The input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("Input"); + auto x_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto w_dims = ctx->GetInputDim("Weight"); + auto w_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(w_grad_name)) { + ctx->SetOutputDim(w_grad_name, w_dims); + } + + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) { + auto bias_dims = ctx->GetInputDim("Bias"); + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad); +REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, + ops::NCEKernel); +REGISTER_OP_CPU_KERNEL(nce_grad, + ops::NCEGradKernel, + ops::NCEGradKernel); diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h new file mode 100644 index 0000000000000000000000000000000000000000..624c2d9bbd3245a11c8cfff2dd7cae6e5b25f106 --- /dev/null +++ b/paddle/fluid/operators/nce_op.h @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "unsupported/Eigen/CXX11/Tensor" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +void PrepareSamples(const framework::ExecutionContext& context) { + auto label = context.Input("Label"); + const int64_t* label_data = label->data(); + auto label_dims = label->dims(); + int num_total_classes = context.Attr("num_total_classes"); + // for unitest + std::vector custom_neg_classes = + context.Attr>("custom_neg_classes"); + // random machine + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_int_distribution rand(0, num_total_classes - 1); + + auto sample_labels = context.Output("SampleLabels"); + auto sample_labels_dims = sample_labels->dims(); + int64_t* sample_labels_data = + sample_labels->mutable_data(context.GetPlace()); + + int num_label = label_dims.size() == 2 ? 
label_dims[1] : 1; + int index = 0; + for (int64_t i = 0; i < label_dims[0]; ++i) { + int j = 0; + for (; j < num_label; ++j) { + sample_labels_data[index++] = label_data[i * num_label + j]; + } + if (custom_neg_classes.size() > 0) { + for (auto label : custom_neg_classes) { + sample_labels_data[index++] = label; + } + } else { + for (; j < sample_labels_dims[1]; ++j) { + // TODO(wanghaoshuang): support more distribution sampling + sample_labels_data[index++] = rand(rng); + } + } + } +} + +template +class NCEKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PrepareSamples(context); + auto sample_labels = context.Output("SampleLabels"); + const int64_t* sample_labels_data = sample_labels->data(); + auto sample_out = context.Output("SampleLogits"); + T* sample_out_data = sample_out->mutable_data(context.GetPlace()); + auto label = context.Input("Label"); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + auto out = context.Output("Cost"); + T* out_data = out->mutable_data(context.GetPlace()); + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); + int64_t num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. 
/ num_total_classes * num_neg_samples; + // forward bias + auto bias = context.Input("Bias"); + if (bias != nullptr) { + const T* bias_data = bias->data(); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = bias_data[sample_labels_data[i]]; + } + } else { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = 0; + } + } + // forward mul + auto input_mat = EigenMatrix::From(*(context.Input("Input"))); + auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } + // forward cost + for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { + int64_t j = 0; + out_data[i] = 0; + T w = sample_weight == nullptr ? 1. : sample_weight_data[i]; + // for true classes + for (; j < num_true_class; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(o / (o + b)); + out_data[i] += w * cost; + } + // for sampled neg classes + for (; j < sample_labels->dims()[1]; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(b / (o + b)); + out_data[i] += w * cost; + } + } + } +}; + +template +class NCEGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto d_out = context.Input(framework::GradVarName("Cost")); + const T* d_out_data = d_out->data(); + auto label = context.Input("Label"); + auto sample_out = context.Input("SampleLogits"); + const T* sample_out_data = sample_out->data(); + auto sample_labels = context.Input("SampleLabels"); + const int64_t* sample_labels_data = sample_labels->data(); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight 
!= nullptr) { + sample_weight_data = sample_weight->data(); + } + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); + int num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. / num_total_classes * num_neg_samples; + Tensor sample_grad; // tmp tensor + T* sample_grad_data = + sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); + // backward cost + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + T o = sample_out_data[i]; + T w = sample_weight == nullptr + ? 1 + : sample_weight_data[i / sample_labels->dims()[1]]; + sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class + ? w * (b / (o + b)) * (o - 1) + : w * (o * (1 - o) / (o + b)); + sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]]; + } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T* d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + // get d_w + auto d_w = context.Output(framework::GradVarName("Weight")); + if (d_w != nullptr) { + auto d_w_data = d_w->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); + auto d_w_matrix = EigenMatrix::From(*d_w); + auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_w_matrix.chip(sample_labels_data[i], 0) += + x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * + sample_grad_data[i]; + } + } + // get d_x + auto d_x = context.Output(framework::GradVarName("Input")); + if (d_x != nullptr) { + auto* d_x_data = d_x->mutable_data(context.GetPlace()); + std::fill(d_x_data, d_x_data + d_x->numel(), 0.0); + auto d_x_matrix = EigenMatrix::From(*d_x); + auto w_matrix 
= EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += + w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/net_op.cc b/paddle/fluid/operators/net_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c0ca5873adcc92a39f20a162796e7581ea10c63f --- /dev/null +++ b/paddle/fluid/operators/net_op.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/net_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +const char NetOp::kAll[] = "all"; + +void NetOp::CompleteAddOp(bool calc) { + add_op_done_ = true; + if (!calc) return; + std::set input_set; + std::set output_set; + for (auto& op : ops_) { + for (auto& ipt : op->Inputs()) { + for (auto& var_name : ipt.second) { + // If input variable has been in output set, then it will be + // added into intermediate_outputs_. Otherwise, it will be + // added into input set. 
+ if (Contains(output_set, var_name)) { + intermediate_outputs_.insert(var_name); + } else { + input_set.insert(var_name); + } + } + } + + for (auto& opt : op->Outputs()) { + for (auto& var_name : opt.second) { + output_set.insert(var_name); + } + } + } + auto& inputs = inputs_[kAll]; + inputs.reserve(input_set.size()); + std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs)); + auto& outputs = outputs_[kAll]; + outputs.reserve(output_set.size()); + std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs)); +} + +std::string NetOp::DebugStringEx(const framework::Scope* scope) const { + std::ostringstream os; + os << OperatorBase::DebugStringEx(scope) << std::endl; + for (auto& op : ops_) { + std::istringstream is(op->DebugStringEx(scope)); + for (std::string line; std::getline(is, line);) { + os << " " << line << std::endl; + } + } + return os.str(); +} + +bool NetOp::IsNetOp() const { return true; } + +std::vector NetOp::OutputVars(bool has_intermediate) const { + std::vector all; + for (auto& pair : this->outputs_) { + for (auto& var_name : pair.second) { + all.push_back(var_name); + } + } + if (has_intermediate) { + return all; + } + std::vector ret_val; + for (auto& each : all) { + if (!Contains(intermediate_outputs_, each)) { + ret_val.push_back(each); + } + } + return ret_val; +} + +NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + +std::unique_ptr NetOp::Clone() const { + PADDLE_ENFORCE( + add_op_done_, + "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone"); + return std::unique_ptr(new NetOp(*this)); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h new file mode 100644 index 
0000000000000000000000000000000000000000..14e5909851c4ac08b5f59c5c193c801827b91234 --- /dev/null +++ b/paddle/fluid/operators/net_op.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +/** + * @brief Network is also a type of Operator + * + * It will manage the operators it has. + * + * Network is the container and controller of a set of operators. + + * A network object knows all Operators belonging to this network. Variables, + * which are inputs and outputs of these operators, are created and managed by a + * hierarchy of Scope objects. + * + * This is the base class of network, all the networks should implement the APIs + * it defines. 
+ */
+class NetOp : public framework::OperatorBase {
+ public:
+  static const char kAll[];
+  NetOp()
+      : framework::OperatorBase("plain_net", framework::VariableNameMap{},
+                                framework::VariableNameMap{},
+                                framework::AttributeMap{}) {}
+
+  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
+        const framework::VariableNameMap& outputs,
+        const framework::AttributeMap& attrs);
+
+  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
+    this->ops_.reserve(o.ops_.size());  // deep copy: each op is cloned below
+    std::transform(
+        o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
+        [](const std::unique_ptr<framework::OperatorBase>& op) {
+          return std::unique_ptr<framework::OperatorBase>(op->Clone());
+        });
+    this->CompleteAddOp();
+  }
+
+  /**
+   * @brief Run the network.
+   *
+   * Runs every operator in `ops_`, in order, forwarding the same `scope` and
+   * `place` to each one. An empty network is a no-op. Nothing is caught here,
+   * so a failure in one operator stops the remaining ones from running.
+   */
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, place);
+    }
+  }
+
+  bool SupportGPU() const override {  // true iff every sub-op supports GPU
+    for (auto& op : ops_) {
+      if (!op->SupportGPU()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
+
+  /**
+   * @brief Append an owned operator; rejected once the network is sealed.
+   */
+  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot AppendOp when this network is sealed");
+    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
+    ops_.push_back(std::move(op));
+  }
+
+  void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot InsertOp when this network is sealed");
+    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
+    PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");  // == size() appends
+    ops_.insert(ops_.begin() + pos, std::move(op));
+  }
+
+  void InsertOp(size_t pos, const framework::OperatorBase& op) {
+    InsertOp(pos, op.Clone());
+  }
+
+  void CompleteAddOp(bool calculate = true);
+
+  std::string DebugStringEx(
+      const framework::Scope* scope = nullptr) const override;
+
+  bool IsNetOp() const override;
+  std::vector<std::string> OutputVars(bool has_intermediate) const override;
+
+  std::unique_ptr<framework::OperatorBase> Clone() const override;
+
+  std::vector<std::unique_ptr<framework::OperatorBase>> ops_;  // owned, in run order
+
+ private:
+  bool add_op_done_{false};  // true => sealed: Append/InsertOp are rejected
+  std::set<std::string> intermediate_outputs_;
+
+  template <typename T, typename KeyType>
+  static bool Contains(const T& container, const KeyType& key) {
+    return container.find(key) != container.end();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc20be0c81763abe2adcf09de858ce51e16d77a6
--- /dev/null
+++ b/paddle/fluid/operators/net_op_test.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/net_op.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace operators {
+using Scope = framework::Scope;
+using DeviceContext = platform::DeviceContext;
+
+static int run_cnt = 0;  // incremented by every TestOp::Run call
+
+class TestOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  DEFINE_OP_CLONE_METHOD(TestOp);
+  void Run(const Scope& scope, const platform::Place& place) const override {
+    ++run_cnt;
+  }
+};
+
+template <typename T>
+void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
+                                  const std::vector<T>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  std::unordered_set<T> expected_set;
+  for (auto& tmp : expected) {
+    expected_set.insert(tmp);
+  }
+  for (auto& act : actual) {  // every actual element must be an expected one
+    ASSERT_NE(expected_set.end(), expected_set.find(act));
+  }
+}
+
+TEST(OpKernel, all) {
+  auto net = std::make_shared<NetOp>();
+  ASSERT_NE(net, nullptr);
+
+  net->AppendOp(std::unique_ptr<TestOp>(
+      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
+                 {{"Out", {"y"}}}, framework::AttributeMap{})));
+  net->AppendOp(std::unique_ptr<TestOp>(
+      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
+                 {{"Out", {"z"}}}, framework::AttributeMap{})));
+
+  net->CompleteAddOp();
+  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
+                               net->Inputs(NetOp::kAll));
+  AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll));
+
+  auto final_outs = net->OutputVars(false);
+
+  ASSERT_EQ(final_outs.size(), 1UL);
+  ASSERT_EQ(final_outs[0], "z");
+}
+
+TEST(NetOp, insert_op) {
+  NetOp net;
+  auto op1 = std::unique_ptr<framework::NOP>(
+      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
+                         {{"Out", {"y"}}}, framework::AttributeMap{}));
+  net.AppendOp(*op1);
+  net.InsertOp(0, *op1);
+  ASSERT_EQ(2UL, net.ops_.size());
+  net.InsertOp(2, std::move(op1));
+  ASSERT_EQ(3UL, net.ops_.size());
+}
+
+TEST(NetOp, Clone) {
+  NetOp net;
+  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
+      "empty", framework::VariableNameMap{}, framework::VariableNameMap{},
+      framework::AttributeMap{}}));
+  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
+      "empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
+      framework::AttributeMap{}}));
+  net.CompleteAddOp(true);
+  auto new_net_op = net.Clone();
+  ASSERT_NE(new_net_op, nullptr);
+  ASSERT_TRUE(new_net_op->IsNetOp());
+  auto* new_net = static_cast<NetOp*>(new_net_op.get());
+  ASSERT_EQ(2UL, new_net->ops_.size());
+  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
+  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee85b1a90a85f8c6ec57900c4c7d0dd319a0186a
--- /dev/null
+++ b/paddle/fluid/operators/norm_op.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/norm_op.h"
+namespace paddle {
+namespace operators {
+
+template <typename AttrType>
+class NormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of norm operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput("Scale",
+             "(Tensor) The input tensor of norm operator. "
+             "The format of input tensor is C * 1.");
+    AddAttr<AttrType>("epsilon",
+                      "(float, default 1e-10) Constant "
+                      "for numerical stability.")
+        .SetDefault(1.0e-10f);
+    AddOutput("Out",
+              "(Tensor) The output tensor of norm operator. "
+              "N * M. "
+              "M = C * H * W");
+    AddComment(R"DOC(
+       "Input shape: $(N, C, H, W)$
+        Scale shape: $(C, 1)$
+        Output shape: $(N, C, H, W)$
+        Where
+        forward
+          $$
+            [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
+          $$
+        backward
+          $$
+            \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
+          $$
+        )DOC");
+  }
+};
+
+class NormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of NormOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of NormOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of NormOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", in_x_dims);  // Out has exactly X's shape
+  }
+};
+
+class NormOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
+            ops::NormOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
+REGISTER_OP_CPU_KERNEL(
+    norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..438bb3b86e79c526f70a23d3c7f6cc13f72e0463
--- /dev/null
+++ b/paddle/fluid/operators/norm_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define EIGEN_USE_GPU
+
+#include "paddle/fluid/operators/norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
+REGISTER_OP_CUDA_KERNEL(
+    norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..db74c9b02a74afc6bd0e59da97e64a3e556b97dc
--- /dev/null
+++ b/paddle/fluid/operators/norm_op.h
@@ -0,0 +1,175 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class NormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
+    out->mutable_data<T>(context.GetPlace());
+    int batch_size = in_x->dims()[0];  // X is indexed as 4-D (see op doc: NCHW)
+    int channels = in_x->dims()[1];
+    int height = in_x->dims()[2];
+    int width = in_x->dims()[3];
+    int fea_len = height * width;
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    auto x =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    // x_square = x * x (element-wise), computed once for the whole batch
+    framework::Tensor x_square;
+    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
+    auto x_square_eigen =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    x_square_eigen.device(*place) = x.square();
+    auto scale_eigen =
+        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
+            *scale);
+    for (int n = 0; n < batch_size; ++n) {  // one sample at a time
+      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
+      auto in_x_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
+      auto x_square_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor out_batch = out->Slice(n, n + 1);
+      auto out_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              out_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor tmp_tensor;
+      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                 context.GetPlace());
+      auto tmp = framework::EigenVector<T, Eigen::RowMajor,
+                                        Eigen::DenseIndex>::Flatten(tmp_tensor);
+      // tmp = 1 / sqrt(sum of x^2 along the channel axis + epsilon)
+      auto dim = Eigen::array<int, 1>({{0}});
+      tmp.device(*place) = x_square_batch_eigen.sum(dim);
+      tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
+      Eigen::array<int, 2> broadcast_dim_col;
+      broadcast_dim_col[1] = 1;
+      broadcast_dim_col[0] = channels;
+      out_batch_eigen.device(*place) =
+          in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));
+      Eigen::array<int, 2> broadcast_dim_row;
+      broadcast_dim_row[1] = fea_len;
+      broadcast_dim_row[0] = 1;
+      out_batch_eigen.device(*place) =
+          out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
+    }
+  }
+};
+template <typename DeviceContext, typename T, typename AttrType = T>
+class NormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    int batch_size = in_x->dims()[0];
+    int channels = in_x->dims()[1];
+    int height = in_x->dims()[2];
+    int width = in_x->dims()[3];
+    int fea_len = height * width;
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    auto scale_eigen =
+        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
+            *scale);
+    auto x =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    // x_square = x * x (element-wise), computed once for the whole batch
+    framework::Tensor x_square;
+    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
+    auto x_square_eigen =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    x_square_eigen.device(*place) = x.square();
+
+    for (int n = 0; n < batch_size; ++n) {  // one sample at a time
+      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
+      auto in_x_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
+      auto in_g_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_g_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
+      auto x_square_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
+      auto outg_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              outg_batch, framework::make_ddim({channels, fea_len}));
+
+      framework::Tensor tmp_tensor;
+      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                 context.GetPlace());
+      auto tmp_eigen =
+          framework::EigenVector<T, Eigen::RowMajor,
+                                 Eigen::DenseIndex>::Flatten(tmp_tensor);
+      auto dim = Eigen::array<int, 1>({{0}});
+      tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
+      framework::Tensor norm_tmp_tensor;
+      norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                      context.GetPlace());
+      auto norm_tmp_eigen =
+          framework::EigenVector<T, Eigen::RowMajor,
+                                 Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
+      norm_tmp_eigen.device(*place) =
+          (x_square_batch_eigen.sum(dim) + epsilon).sqrt();
+      Eigen::array<int, 2> broadcast_dim_col;
+      broadcast_dim_col[1] = 1;
+      broadcast_dim_col[0] = channels;
+      in_g_batch_eigen.device(*place) =
+          in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen /
+          (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
+      in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
+      // in_g now holds dy - x * sum(x * dy) / norm^2; divide by norm next.
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
+      Eigen::array<int, 2> broadcast_dim_row;
+      broadcast_dim_row[1] = fea_len;
+      broadcast_dim_row[0] = 1;
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
+    }
+  }
+};
+}  // namespace
operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c3a60da729d6f4edb8e7e3aa0c81cd3140855c0
--- /dev/null
+++ b/paddle/fluid/operators/one_hot_op.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/one_hot_op.h"
+#include "paddle/fluid/framework/framework.pb.h"
+
+namespace paddle {
+namespace operators {
+
+class OneHotOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of OneHotOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of OneHotOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Rank of Input(X) should be at least 2.");
+    PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1,  // kernels index numel*depth
+                      "Last dimension of Input(X) should be 1.");
+
+    int depth = ctx->Attrs().Get<int>("depth");
+
+    PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth);
+
+    framework::DDim out_dims(x_dims);
+    out_dims[out_dims.size() - 1] = depth;  // [..., 1] -> [..., depth]
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /* --> */ "Out");
+  }
+};
+
+class OneHotOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, LoDTensor<int>) Input variable with rank at least 2. "
+             "The last dimension of X should be 1. Each value of X is an index "
+             "to indicate the position.");
+    AddOutput("Out",
+              "(Tensor, Tensor<float>) Output tensor with same rank as X. "
+              "The tensor consists of one-hot representations of values in X.");
+    AddAttr<int>("depth",
+                 "A positive integer to specify the length of one-hot vector.");
+    AddAttr<int>("dtype",
+                 "An integer to specify the data type of one-hot "
+                 "vector. The default value is FP32.")
+        .SetDefault(paddle::framework::proto::DataType::FP32);
+    AddComment(R"DOC(
+One Hot Operator. This operator creates the one-hot representations for input
+index values. The following example will help to explain the function of this
+operator:
+
+X is a LoDTensor:
+  X.lod = [[0, 1, 4]]
+  X.shape = [4, 1]
+  X.data = [[1], [1], [3], [0]]
+
+set depth = 4
+
+Out is a LoDTensor:
+  Out.lod = [[0, 1, 4]]
+  Out.shape = [4, 4]
+  Out.data = [[0., 1., 0., 0.],
+              [0., 1., 0., 0.],
+              [0., 0., 0., 1.],
+              [1., 0., 0., 0.]]
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(one_hot, ops::OneHotOp, ops::OneHotOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    one_hot, ops::OneHotKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::OneHotKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6a8061edaab61661f57b17cd5e065c5c84edb906
--- /dev/null
+++ b/paddle/fluid/operators/one_hot_op.cu
@@ -0,0 +1,80 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/one_hot_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename InT, typename OutT>
+__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data,
+                                 const int64_t numel, const int depth) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per index
+  if (idx < numel) {
+    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;  // no range check
+  }
+}
+
+template <typename DeviceContext, typename InT>
+struct OneHotOpCUDAFunctor {
+  const framework::LoDTensor* in_;
+  framework::LoDTensor* out_;
+  const DeviceContext& ctx_;
+  int depth_;
+
+  OneHotOpCUDAFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
+                      int depth, const DeviceContext& ctx)
+      : in_(in), out_(out), ctx_(ctx), depth_(depth) {}
+
+  template <typename OutT>
+  void operator()() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
+    auto stream = ctx_.stream();
+    math::set_constant(ctx_, out_, 0.0);  // zero-fill, then set the hot cells
+
+    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
+                           PADDLE_CUDA_NUM_THREADS,
+                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        p_in_data, p_out_data, numel, depth_);
+  }
+};
+
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class OneHotCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int depth = context.Attr<int>("depth");
+
+    framework::VisitDataType(
+        static_cast<framework::proto::DataType>(context.Attr<int>("dtype")),
+        OneHotOpCUDAFunctor<DeviceContext, T>(
+            in, out, depth, context.template device_context<DeviceContext>()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    one_hot, ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ddac6edd0ec73a932dcf9c6ca7d7d63853467f1c
--- /dev/null
+++ b/paddle/fluid/operators/one_hot_op.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename InT>
+struct OneHotOpFunctor {
+  const framework::LoDTensor* in_;
+  framework::LoDTensor* out_;
+  int depth_;
+  const DeviceContext& ctx_;
+
+  OneHotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
+                  int depth, const DeviceContext& ctx)
+      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
+
+  template <typename OutT>
+  void operator()() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
+    math::set_constant(ctx_, out_, 0.0);  // zero-fill, then set the hot cells
+
+    for (int64_t i = 0; i < numel; ++i) {  // 64-bit: i * depth_ cannot wrap
+      PADDLE_ENFORCE_GE(p_in_data[i], 0,
+                        "Illegal index value, should be at least 0.");
+      PADDLE_ENFORCE_LT(p_in_data[i], depth_,
+                        "Illegal index value, should be less than depth (%d).",
+                        depth_);
+      *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
+    }
+  }
+};
+
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class OneHotKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int depth = context.Attr<int>("depth");
+
+    framework::VisitDataType(
+        static_cast<framework::proto::DataType>(context.Attr<int>("dtype")),
+        OneHotOpFunctor<DeviceContext, T>(
+            in, out, depth, context.template device_context<DeviceContext>()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/op_documentation/batch_norm_op.md b/paddle/fluid/operators/op_documentation/batch_norm_op.md
similarity index 100%
rename from paddle/operators/op_documentation/batch_norm_op.md
rename to paddle/fluid/operators/op_documentation/batch_norm_op.md
diff --git a/paddle/operators/op_documentation/name_convention.md b/paddle/fluid/operators/op_documentation/name_convention.md
similarity index 100%
rename from paddle/operators/op_documentation/name_convention.md
rename to
paddle/fluid/operators/op_documentation/name_convention.md diff --git a/paddle/operators/op_documentation/net_op_design.md b/paddle/fluid/operators/op_documentation/net_op_design.md similarity index 100% rename from paddle/operators/op_documentation/net_op_design.md rename to paddle/fluid/operators/op_documentation/net_op_design.md diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/fluid/operators/op_documentation/op_markdown_format.md similarity index 100% rename from paddle/operators/op_documentation/op_markdown_format.md rename to paddle/fluid/operators/op_documentation/op_markdown_format.md diff --git a/paddle/operators/op_documentation/rnn_design.md b/paddle/fluid/operators/op_documentation/rnn_design.md similarity index 100% rename from paddle/operators/op_documentation/rnn_design.md rename to paddle/fluid/operators/op_documentation/rnn_design.md diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b021fde7cba699327e1874d569a14c3139a4c32 --- /dev/null +++ b/paddle/fluid/operators/pad_op.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/pad_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class PadOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of PadOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of PadOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
+                      "Size of paddings should be equal to 2 * dimension size "
+                      "of input tensor.");
+    std::vector<int64_t> out_dims(x_dim.size());
+    for (int i = 0; i < x_dim.size(); ++i) {  // before = [2i], after = [2i+1]
+      out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (out_dims[0] == x_dim[0]) {
+      // Only pass LoD when the first dimension is equal between
+      // output and input.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+};
+
+class PadOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PadOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input of pad op. "
+             "The input should be a k-D tensor(k > 0 and k < 7)");
+    AddOutput("Out",
+              "The output of pad op. "
+              "A tensor with the same rank as X, enlarged by the paddings.");
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>) "
+        "A list to describe the padding rules for each dimension. "
+        "For 2-D image tensor, paddings=[0, 1, 2, 3] means "
+        "padding 0 row to top, 1 row to bottom, 2 columns to left "
+        "and 3 columns to right. Size of paddings should be equal to "
+        "2 * dimension size of the input tensor.");
+    AddAttr<float>("pad_value",
+                   "(float, default 0.0) "
+                   "The value to fill the padded areas.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+Pad Operator.
+
+Pad input into output, as specified by paddings and pad_value.
+The input should be a k-D tensor(k > 0 and k < 7). As an example:
+
+Given:
+
+X = [[1, 2],
+     [3, 4]],
+
+paddings = [0, 1, 1, 2],
+
+and
+
+pad_value = 0,
+
+we have:
+
+Out = [[0, 1, 2, 0, 0]
+       [0, 3, 4, 0, 0]
+       [0, 0, 0, 0, 0]]
+
+)DOC");
+  }
+};
+
+class PadOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {  // X@GRAD is optional
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+class PadOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> bind(new framework::OpDesc());
+    bind->SetInput("X", Input("X"));
+    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    bind->SetAttrMap(Attrs());
+    bind->SetType("pad_grad");
+    return bind;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker);
+REGISTER_OPERATOR(pad_grad, ops::PadOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..203c31440371440b5e942452dab08978e2136275
--- /dev/null
+++ b/paddle/fluid/operators/pad_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle
Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/fluid/operators/pad_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    pad, ops::PadKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..244d8f9b6cf51ab249e991ed129e99da67ff9e62
--- /dev/null
+++ b/paddle/fluid/operators/pad_op.h
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T, size_t D>
+void PadFunction(const framework::ExecutionContext& context) {
+  auto pads = context.Attr<std::vector<int>>("paddings");
+  Eigen::array<std::pair<int, int>, D> paddings;
+  for (size_t i = 0; i < paddings.size(); ++i) {  // (before, after) per dim
+    paddings[i].first = pads[i * 2];
+    paddings[i].second = pads[i * 2 + 1];
+  }
+  T pad_value = context.Attr<float>("pad_value");  // registered as float
+
+  auto* x = context.Input<Tensor>("X");
+  auto* out = context.Output<Tensor>("Out");
+  out->mutable_data<T>(context.GetPlace());
+
+  auto x_tensor = EigenTensor<T, D>::From(*x);
+  auto out_tensor = EigenTensor<T, D>::From(*out);
+  auto& place =
+      *context.template device_context<DeviceContext>().eigen_device();
+  out_tensor.device(place) = x_tensor.pad(paddings, pad_value);
+}
+
+template <typename DeviceContext, typename T>
+class PadKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    int rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {  // rank must be a compile-time constant for Eigen
+      case 1:
+        PadFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        PadFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        PadFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        PadFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        PadFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        PadFunction<DeviceContext, T, 6>(context);
+        break;
+      default:
+        PADDLE_THROW(
+            "PadOp only support tensors with no more than 6 dimensions.");
+    }
+  }
+};
+
+template <typename DeviceContext, typename T, size_t D>
+void PadGradFunction(const framework::ExecutionContext& context) {
+  auto pads = context.Attr<std::vector<int>>("paddings");
+  Eigen::array<std::pair<int, int>, D> paddings;
+  for (size_t i = 0; i < paddings.size(); ++i) {  // forward paddings, negated
+    paddings[i].first = -pads[i * 2];
+    paddings[i].second = -pads[i * 2 + 1];
+  }
+  auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+  if (d_x != nullptr) {  // X@GRAD is optional
+    d_x->mutable_data<T>(context.GetPlace());
+    auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
+    auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class PadGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    int rank =
+        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
+    switch (rank) {  // same rank dispatch as PadKernel
+      case 1:
+        PadGradFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        PadGradFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        PadGradFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        PadGradFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        PadGradFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        PadGradFunction<DeviceContext, T, 6>(context);
+        break;
+      default:
+        PADDLE_THROW(
+            "PadOp only support tensors with no more than 6 dimensions.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e25df92479943d210d98f02374f377f778f43d2c
--- /dev/null
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -0,0 +1,378 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +static constexpr char kInputs[] = "inputs"; +static constexpr char kParameters[] = "parameters"; +static constexpr char kPlaces[] = "places"; + +static constexpr char kOutputs[] = "outputs"; +static constexpr char kParallelScopes[] = "parallel_scopes"; + +static constexpr char kParallelBlock[] = "sub_block"; + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +static void SplitTensorAndMoveTensorToScopes( + const framework::Scope &scope, std::vector *sub_scopes, + const std::vector &places, + const std::vector &names) { + size_t num_sub_scopes = 0; + for (auto &argu : names) { + const auto &tensor = + detail::Ref(scope.FindVar(argu), + "Cannot find variable %s in the parent scope", argu) + .Get(); + auto lod_tensors = tensor.SplitLoDTensor(places); + + for (auto &lod : lod_tensors) { + VLOG(3) << lod.dims(); + } + if (num_sub_scopes == 0) { + num_sub_scopes = lod_tensors.size(); + } else { + PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size()); + } + PADDLE_ENFORCE_NE(num_sub_scopes, 0); + if (sub_scopes->size() == 0) { + sub_scopes->reserve(num_sub_scopes); + for (size_t i = 0; i < num_sub_scopes; ++i) { + sub_scopes->emplace_back(&scope.NewScope()); + } + } + + for (size_t i = 0; i < lod_tensors.size(); ++i) { + *detail::Ref(sub_scopes->at(i)->Var(argu), + "Cannot find variable in the sub-scope", argu) + .GetMutable() = lod_tensors[i]; + } + } +} + +inline void CopyOrShare(const framework::Variable &src, + const platform::Place &dst_place, + framework::Variable *dst) { + if (src.IsType()) { + if (src.Get().place() == dst_place) { + dst->GetMutable()->ShareDataWith(src.Get()); + dst->GetMutable()->set_lod(src.Get().lod()); + } else { + Copy(src.Get(), dst_place, 
dst->GetMutable()); + } + } else if (src.IsType()) { + auto &src_sr = src.Get(); + auto *dst_sr = dst->GetMutable(); + dst_sr->set_height(src_sr.height()); + if (src_sr.value().place() == dst_place) { + dst_sr->mutable_value()->ShareDataWith(src_sr.value()); + dst_sr->set_rows(src_sr.rows()); + } else { + Copy(src_sr.value(), dst_place, dst_sr->mutable_value()); + } + } else { + PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); + } +} + +void WaitOnPlace(const platform::Place place) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); +} + +void WaitOnPlaces(const std::vector places) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + + for (auto &place : places) { + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); + } +} + +class ParallelDoOp : public framework::OperatorBase { + public: + ParallelDoOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) + ->GetMutable>(); + + // split input + SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places, + Inputs(kInputs)); + + // copy parameter + for (auto ¶m : Inputs(kParameters)) { + PADDLE_ENFORCE(scope.FindVar(param)->IsType(), + "Only support parameter type as LoDTensor"); + auto &src = scope.FindVar(param)->Get(); + for (size_t i = 0; i < sub_scopes.size(); ++i) { + auto &place = 
places[i]; + auto *sub_scope = sub_scopes[i]; + auto *dst = sub_scope->Var(param)->GetMutable(); + framework::Copy(src, place, dst); + } + } + WaitOnPlaces(places); + + std::vector> workers; + workers.reserve(places.size()); + for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) { + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + // merge output + for (auto &o_name : Outputs(kOutputs)) { + std::vector lod_tensors; + lod_tensors.reserve(sub_scopes.size()); + for (auto *sub_scope : sub_scopes) { + lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); + } + + auto *lod_tensor_to_be_merged = + scope.FindVar(o_name)->GetMutable(); + lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); + } + WaitOnPlaces(places); + } +}; + +class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInputs, "").AsDuplicable(); + AddInput(kParameters, "").AsDuplicable(); + AddInput(kPlaces, ""); + AddOutput(kOutputs, "").AsDuplicable(); + AddOutput(kParallelScopes, ""); + AddAttr(kParallelBlock, ""); + AddComment(R"DOC( +ParallelDo Operator. 
+)DOC"); + } +}; + +class ParallelDoGradOp : public framework::OperatorBase { + public: + ParallelDoGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) + ->Get>(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + // feed output@grad + SplitTensorAndMoveTensorToScopes( + scope, const_cast *>(&sub_scopes), + places, Inputs(framework::GradVarName(kOutputs))); + WaitOnPlaces(places); + + // exe run + std::vector> workers; + for (size_t i = 0; i < sub_scopes.size(); ++i) { + auto &place = places[i]; + auto *cur_scope = sub_scopes[i]; + + // execute + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + AccumulateGrad(scope, place, sub_scopes, places); + } + + void AccumulateGrad(const framework::Scope &scope, + const platform::Place &place, + const std::vector &sub_scopes, + const platform::PlaceList &places) const { + for (auto &s : Outputs(framework::GradVarName(kParameters))) { + VLOG(3) << "Accumulating " << s; + if (s == framework::kEmptyVarName) continue; + std::string tmp_name; + auto *tmp = sub_scopes[0]->Var(&tmp_name); + + for (size_t i = 1; i < sub_scopes.size(); ++i) { + CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp); + WaitOnPlaces(places); + + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, + framework::AttributeMap{}); + VLOG(10) << 
sum_op->DebugStringEx(sub_scopes[0]); + sum_op->Run(*sub_scopes[0], places[0]); + WaitOnPlace(places[0]); + } + + CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); + } + WaitOnPlaces(places); + } +}; + +std::ostream &operator<<(std::ostream &sout, + const std::vector &strs) { + std::copy(strs.begin(), strs.end(), + std::ostream_iterator(sout, ",")); + return sout; +} + +class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); + grad->SetType("parallel_do_grad"); + for (auto &input_param : this->InputNames()) { + VLOG(3) << input_param; + grad->SetInput(input_param, this->Input(input_param)); + if (input_param != kPlaces) { + grad->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, false)); + } + } + auto *g_block = this->grad_block_[0]; + + // All variable name that needed by gradient operators + std::unordered_set all_inputs_in_grad_blocks; + + for (size_t i = 0; i < g_block->OpSize(); ++i) { + auto *op = g_block->Op(i); + for (auto &var_name : op->InputArgumentNames()) { + all_inputs_in_grad_blocks.insert(var_name); + } + } + + for (auto &output_param : this->OutputNames()) { + if (output_param == kParallelScopes) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->Output(output_param)); + } else { + grad->SetInput(output_param, this->Output(output_param)); + std::vector og_names; + for (auto &og_name : this->OutputGrad(output_param)) { + if (all_inputs_in_grad_blocks.count(og_name) != 0) { + // there are some gradient operators who need the OG. So make this + // OG as an input of parallel.do + og_names.push_back(og_name); + } + // else, there is no operator who need the OG. 
Do not use this OG as + // an input + } + grad->SetInput(framework::GradVarName(output_param), og_names); + } + } + grad->SetAttrMap(this->Attrs()); + grad->SetBlockAttr(kParallelBlock, *grad_block_[0]); + + return std::unique_ptr(grad); + } +}; + +class ParallelDoGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs(kParameters)); + PADDLE_ENFORCE(ctx->HasInputs(kInputs)); + PADDLE_ENFORCE(ctx->HasInputs(kOutputs)); + + ctx->SetOutputsDim(framework::GradVarName(kParameters), + ctx->GetInputsDim(kParameters)); + + auto i_dims = ctx->GetInputsDim(kInputs); + auto ig_names = ctx->Outputs(framework::GradVarName(kInputs)); + + for (size_t i = 0; i < ig_names.size(); ++i) { + auto &ig_name = ig_names[i]; + if (ig_name == framework::kEmptyVarName) { + continue; + } + + ctx->SetDims({ig_name}, {i_dims[i]}); + } + + auto p_dims = ctx->GetInputsDim(kParameters); + auto pg_names = ctx->Outputs(framework::GradVarName(kParameters)); + for (size_t i = 0; i < pg_names.size(); ++i) { + auto &pg_name = pg_names[i]; + if (pg_name == framework::kEmptyVarName) { + continue; + } + ctx->SetDims({pg_name}, {p_dims[i]}); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, + paddle::operators::ParallelDoOpProtoMaker, + paddle::operators::ParallelDoGradOpDescMaker); +REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, + paddle::operators::ParallelDoGradOpShapeInference); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..75984b7721c48526c6d11c4b82004dba1c166cc4 --- /dev/null +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; +using DataLayout = platform::DataLayout; +using PoolingMode = platform::PoolingMode; + +template +class PoolCUDNNOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + const Tensor *input = ctx.Input("X"); + Tensor *output = ctx.Output("Out"); + + const T *input_data = input->data(); + T *output_data = output->mutable_data(ctx.GetPlace()); + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + DataLayout layout; + + if (strides.size() == 2U) { + layout = 
DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = PoolingMode::kAverage; + } + + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, ksize, paddings, strides); + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cuda_device_context().cudnn_handle(); + T alpha = 1.0f, beta = 0.0f; + + PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( + handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, + cudnn_output_desc, output_data)); + } +}; + +template +class PoolCUDNNGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + const Tensor *input = ctx.Input("X"); + const Tensor *output = ctx.Input("Out"); + const Tensor *output_grad = + ctx.Input(framework::GradVarName("Out")); + Tensor *input_grad = ctx.Output(framework::GradVarName("X")); + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + const T *input_data = input->data(); + const T *output_data = output->data(); + const T *output_grad_data = output_grad->data(); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor 
output_desc; + ScopedPoolingDescriptor pool_desc; + DataLayout layout; + + if (strides.size() == 2U) { + layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = PoolingMode::kAverage; + } + + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, ksize, paddings, strides); + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cuda_device_context().cudnn_handle(); + T alpha = 1.0f, beta = 0.0f; + + if (input_grad) { + T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. 
+ + PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( + handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, + cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, + &beta, cudnn_input_desc, input_grad_data)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); + +REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dd33eefc5fd18ba08dce5ea8dff791cda54332c --- /dev/null +++ b/paddle/fluid/operators/pool_op.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/pool_op.h" + +namespace paddle { +namespace operators { + +int OutputSizePool(int input_size, int filter_size, int padding, int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +void PoolOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Pooling should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + + std::string pooling_type = ctx->Attrs().Get("pooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Pooling intput should be 4-D or 5-D tensor."); + + if (ctx->Attrs().Get("global_pooling")) { + ksize.resize(static_cast(in_x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x_dims[i + 2]); + } + } + + PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, + "Input size and pooling size should be consistent."); + PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), + "Strides size and pooling size should be the same."); + PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), + "Paddings size and pooling size should be the same."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("X", "Out"); +} + +framework::OpKernelType PoolOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if 
(platform::is_gpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + +void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); +} + +framework::OpKernelType PoolOpGrad::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + +Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of pooling operator. 
" + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. " + "The format of output tensor is also NCHW, " + "where N is batch size, C is the number of channels, " + "H is the height of the feature, " + "and W is the width of the feature."); + + AddAttr("pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr>("ksize", + "(vector) The pooling window " + "size(height, width) of the pooling operator. " + "If global_pooling = true, ksize and paddings will " + "be ignored."); // TODO(Chengduo): Add checker. + // (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr("global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1, 1}), strides(height, " + "width) of pooling operator.") + .SetDefault({1, 1}); + // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0}), paddings(height, width) of pooling " + "operator." + "If global_pooling = true, paddings and ksize will be ignored.") + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + + AddComment(R"DOC( +Pool2d Operator. 
+ +The pooling2d operation calculates the output based on +the input, pooling_type and ksize, strides, paddings parameters. +Input(X) and output(Out) are in NCHW format, where N is batch size, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Parameters(ksize, strides, paddings) are two elements. +These two elements represent height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + Where + $$ + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + $$ + +)DOC"); +} + +Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCDHW, where N is batch size, C is " + "the number of channels, and D, H and W is the depth, height and " + "width of " + "the feature, respectively."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator." + "The format of output tensor is also NCDHW, " + "where N is batch size, C is " + "the number of channels, and D, H and W is the depth, height and " + "width of the feature, respectively."); + + AddAttr("pooling_type", + "(string) Pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr>( + "ksize", + "(vector) The pooling window size(depth, height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings will " + "be ignored."); // TODO(Chengduo): Add checker. + // (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default false) Whether to use the global pooling. 
" + "If global_pooling = true, ksize and paddings wille be ignored.") + .SetDefault(false); + AddAttr>( + "strides", + "(vector, default {1,1,1}) Strides(depth, height, " + "width) of the pooling operator.") + .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0,0}), paddings(depth, height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + + AddComment(R"DOC( +Pool3d Operator. + +The pooling3d operation calculates the output based on +the input, pooling_type, ksize, strides, and paddings parameters. +Input(X) and output(Out) are in NCDHW format, where N is batch +size, C is the number of channels, and D, H and W are the depth, height and +width of the feature, respectively. Parameters(ksize, strides, paddings) +are three elements. These three elements represent depth, height and +width, respectively. The input(X) size and output(Out) size may be different. 
+ +Example: + Input: + X shape: $(N, C, D_{in}, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ + +)DOC"); +} +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, + ops::PoolOpGrad); + +REGISTER_OP_CPU_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_grad, ops::PoolGradKernel, + ops::PoolGradKernel) + +REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, + ops::PoolOpGrad); + +REGISTER_OP_CPU_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_grad, ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.cu.cc b/paddle/fluid/operators/pool_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..14486c07402af387ee11a127ba193ac9ac36c8a2 --- /dev/null +++ b/paddle/fluid/operators/pool_op.cu.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/pool_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); + +REGISTER_OP_CUDA_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4cabd634d66e402282f17ed8724129a3e6e1ff43 --- /dev/null +++ b/paddle/fluid/operators/pool_op.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class PoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class PoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +template +class PoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + + std::string pooling_type = context.Attr("pooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + auto& dev_ctx = context.template device_context(); + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + paddle::operators::math::Pool2dFunctor< + 
DeviceContext, paddle::operators::math::MaxPool, T> + pool2d_forward; + paddle::operators::math::MaxPool pool_process; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + + } else if (pooling_type == "avg") { + paddle::operators::math::Pool2dFunctor< + DeviceContext, paddle::operators::math::AvgPool, T> + pool2d_forward; + paddle::operators::math::AvgPool pool_process; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } + } break; + case 3: { + if (pooling_type == "max") { + paddle::operators::math::Pool3dFunctor< + DeviceContext, paddle::operators::math::MaxPool, T> + pool3d_forward; + paddle::operators::math::MaxPool pool_process; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool3dFunctor< + DeviceContext, paddle::operators::math::AvgPool, T> + pool3d_forward; + paddle::operators::math::AvgPool pool_process; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } +}; + +template +class PoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + + std::string pooling_type = context.Attr("pooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + auto& dev_ctx = context.template device_context(); + if (in_x_grad) { + 
in_x_grad->mutable_data(context.GetPlace()); + paddle::operators::math::SetConstant set_constant; + set_constant(dev_ctx, in_x_grad, 0.0); + + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + paddle::operators::math::MaxPool2dGradFunctor + pool2d_backward; + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool2dGradFunctor< + DeviceContext, paddle::operators::math::AvgPoolGrad, T> + pool2d_backward; + paddle::operators::math::AvgPoolGrad pool_process; + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); + } + } break; + case 3: { + if (pooling_type == "max") { + paddle::operators::math::MaxPool3dGradFunctor + pool3d_backward; + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool3dGradFunctor< + DeviceContext, paddle::operators::math::AvgPoolGrad, T> + pool3d_backward; + paddle::operators::math::AvgPoolGrad pool_process; + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef6d5d867b2d38b3ca26deb1cbd9f16ca9846d0f --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -0,0 +1,291 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pool_with_index_op.h" + +namespace paddle { +namespace operators { + +inline int OutputSizeMaxPool(int input_size, int filter_size, int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +class MaxPoolWithIndexOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mask"), + "Output(Mask) of Pooling should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Pooling intput should be 4-D or 5-D tensor."); + + if (ctx->Attrs().Get("global_pooling")) { + ksize.resize(static_cast(in_x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x_dims[i + 2]); + } + } + + PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, + "Input size and pooling size should be consistent."); + PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), + "Strides size and pooling size should be the same."); + PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), + "Paddings size 
and pooling size should be the same."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(OutputSizeMaxPool(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->SetOutputDim("Mask", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the image, " + "and W is the width of the image."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. 
" + "The format of output tensor is also NCHW, " + "where N is batch size, C is " + "the number of channels, H is the height of the image " + "and W is the width of the image."); + AddOutput("Mask", + "(Tensor) The Mask tensor of pooling operator." + "The format of output tensor is also NCHW, " + "where N is batch size, C is the number of channels, " + "H is the height of the image, " + "and W is the width of the image. " + "It represents the index in the current feature map."); + + AddAttr>("ksize", + "(vector) The pooling window size(height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings " + "will be ignored."); // TODO(Chengduo): Add + // checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default:false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1, 1}), strides(height, " + "width) of pooling operator.") + .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default:{0, 0}), paddings(height, width) of pooling " + "operator. " + "If global_pooling = true, paddings and will be ignored.") + .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddComment(R"DOC( +MaxPool2d Operator. + +The maxPooling2d with index operation calculates the output and the mask +based on the input, ksize, strides, and paddings parameters. Input(X) and +output(Out, Mask) are in NCHW format, where N is batch size, C is the +number of channels, H is the height of the feature, +and W is the width of the feature. +Parameters(ksize, strides, paddings) are two elements. +These two elements represent height and width, respectively. +The input(X) size and output(Out, Mask) size may be different. 
+ +Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + Mask shape: $(N, C, H_{out}, W_{out})$ + Where + $$ + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + $$ + +)DOC"); + } +}; + +class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCDHW, where N is batch size, C is " + "the number of channels, and D, H and W are the depth, height and " + "width of " + "the image, respectively"); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. " + "The format of output tensor is also NCDHW, " + "where N is the batch size, C is the number of channels, " + "and D, H and W are the depth, height and " + "width of the image, respectively."); + AddOutput("Mask", + "(Tensor) The Mask tensor of pooling operator. " + "The format of output tensor is also NCDHW, " + "where N is the batch size, C is the number of channels, and " + "D, H and W are the depth, height and width " + "of the image, respectively. " + "It represents the index in the current feature map."); + + AddAttr>("ksize", + "(vector) The pooling window size(depth, " + "height, width) of pooling operator. " + "If global_pooling = true, ksize and paddings " + "will be ignored."); // TODO(Chengduo): Add + // checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default false) Whether to use the global pooling. 
" + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1,1,1}), strides(depth, " + "height, width) of pooling operator.") + .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0,0}), paddings(depth, " + "height, width) of pooling operator. " + "If global_pooling = true, paddings and ksize will be ignored.") + .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddComment(R"DOC( +MaxPool3d Operator. + +The maxpooling3d with index operation calculates the output and the mask +based on the input and ksize, strides, paddings parameters. +Input(X) and output(Out, Mask) are in NCDHW format, where N is batch +size, C is the number of channels, and D, H and W are the depth, height and +width of the feature, respectively. +Parameters(ksize, strides, paddings) are three elements. +These three elements represent depth, height and width, respectively. +The input(X) size and output(Out, Mask) size may be different. 
+ +Example: + Input: + X shape: $(N, C, D_{in}, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, + ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad, + ops::MaxPoolWithIndexOpGrad); + +REGISTER_OP_CPU_KERNEL( + max_pool2d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CPU_KERNEL( + max_pool2d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) + +REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, + ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, + ops::MaxPoolWithIndexOpGrad); + +REGISTER_OP_CPU_KERNEL( + max_pool3d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CPU_KERNEL( + max_pool3d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..722a4d1e2a4a4ad5c1268483db46e5e9d5d4a33b --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.cu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pool_with_index_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + max_pool2d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( + max_pool2d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) + +REGISTER_OP_CUDA_KERNEL( + max_pool3d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( + max_pool3d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h new file mode 100644 index 0000000000000000000000000000000000000000..da7ef9df73a51aabc208521880168144de6f392c --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MaxPoolWithIndexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + Tensor* mask = context.Output("Mask"); + + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& dev_ctx = context.template device_context(); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + + switch (ksize.size()) { + case 2: { + paddle::operators::math::MaxPool2dWithIndexFunctor + pool2d_forward; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + } break; + case 3: { + paddle::operators::math::MaxPool3dWithIndexFunctor + pool3d_forward; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } +}; + +template +class MaxPoolWithIndexGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* mask = context.Input("Mask"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + 
ksize[i] = static_cast(in_x_grad->dims()[i + 2]); + } + } + + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + auto& device_ctx = context.template device_context(); + math::set_constant(device_ctx, in_x_grad, 0); + + switch (ksize.size()) { + case 2: { + paddle::operators::math::MaxPool2dWithIndexGradFunctor + pool2d_backward; + pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, + paddings, in_x_grad); + } break; + case 3: { + paddle::operators::math::MaxPool3dWithIndexGradFunctor + pool3d_backward; + pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, + paddings, in_x_grad); + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d237da25a00de13057e009b6705d3241b8b26539 --- /dev/null +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/positive_negative_pair_op.h" + +namespace paddle { +namespace operators { + +class PositiveNegativePairOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("Score"), + "Input(Score) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("Label"), + "Input(Label) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("QueryID"), + "Input(QueryID) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PositivePair"), + "Output(PositivePair) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegativePair"), + "Output(NegativePair) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NeutralPair"), + "Output(NeutralPair) of PositiveNegativePairOp should not be null."); + auto scalar_dim = framework::make_ddim({1}); + if (ctx->HasInput("AccumulatePositivePair") || + ctx->HasInput("AccumulateNegativePair") || + ctx->HasInput("AccumulateNeutralPair")) { + PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") && + ctx->HasInput("AccumulateNegativePair") && + ctx->HasInput("AccumulateNeutralPair"), + "All optional inputs(AccumulatePositivePair, " + "AccumulateNegativePair, AccumulateNeutralPair) of " + "PositiveNegativePairOp are required if one of them is " + "specified."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim, + "Shape of AccumulatePositivePair should be {1}."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim, + "Shape of AccumulateNegativePair should be {1}."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim, + "Shape of AccumulateNeutralPair should be {1}."); + } + + auto score_dim = ctx->GetInputDim("Score"); + auto label_dim = 
ctx->GetInputDim("Label"); + auto query_dim = ctx->GetInputDim("QueryID"); + PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + label_dim[0], score_dim[0], + "Tensor Score and Label should have the same height (batch size)."); + PADDLE_ENFORCE_EQ(label_dim[1], 1, + "The width of Label should be 1, i.e. each item should " + "have a scalar label."); + PADDLE_ENFORCE(query_dim == label_dim, + "QueryID should have the same shape as Label."); + if (ctx->HasInput("Weight")) { + PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim, + "Weight should have the same shape as Label."); + } + int column = ctx->Attrs().Get("column"); + auto depth = score_dim[1]; + PADDLE_ENFORCE(column < depth && column >= -depth, + "Attribute column should be in the range of [-%l, %l)", + depth, depth); + + ctx->SetOutputDim("PositivePair", scalar_dim); + ctx->SetOutputDim("NegativePair", scalar_dim); + ctx->SetOutputDim("NeutralPair", scalar_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Score")->type()), + ctx.device_context()); + } +}; + +class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Score", + "(Tensor, float) Model Score on an item (with " + "respect to QueryID). It's a 2-D tensor with shape [batch_size, " + "depth], where the column specified by the attribute \"column\" " + "is used as item score."); + AddInput("Label", + "(Tensor, float) Label of an item (with repsect to " + "QueryId). It's a 2-D tensor with shape [batch_size, 1]."); + AddInput("QueryID", + "(Tensor, int64) Query ID that indicates the context. 
Its shape " + "should be the same as Label."); + AddInput( + "AccumulatePositivePair", + "(float) Optional. The accumulated number of positive pairs over a " + "stream of data. If provided, the output PositivePair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput( + "AccumulateNegativePair", + "(float) Optional. The accumulated number of negative pairs over a " + "stream of data. If provided, the output NegativePair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput("AccumulateNeutralPair", + "(float) Optional. The accumulated number of neutral pairs over a " + "stream of data. If provided, the output NeutralPair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput("Weight", + "(float) Optional. Weight of current item. If specified, its " + "shape should be the same as Label, and the meaning of the output " + "changes from numbers of pairs to the total sum of pairs' " + "weights. Weight of a pair of items is the average of their " + "weights.") + .AsDispensable(); + AddOutput("PositivePair", + "(float) Number of positive pairs, i.e. the pairs of " + "items that are ranked correctly."); + AddOutput("NegativePair", + "(float) Number of negative pairs, i.e. the pairs of " + "items that are ranked incorrectly."); + AddOutput("NeutralPair", + "(float) Number of neutral pairs, i.e. the pairs of items " + "that have the same score.") + .AsDispensable(); + AddAttr( + "column", + "(int, default -1) The column position of Score used to rank items in " + "descending order. It must be in the range of [-rank(Score), " + "rank(Score)). " + "If `dim < 0`, the dim to reduce is `rank + dim`. 
" + "Noting that reducing on the first dim will make the LoD info lost.") + .SetDefault(0); + AddComment(R"DOC( +PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's +performance. + +Within some context, e.g. the "query", a LTR model generates scores for a list +of items, which gives a partial order of the items. PositiveNegativePairOp +takes a list of reference rank order (Input("Label")) and the model generated +scores (Input(Score)) as inputs and counts the pairs that ranked correctly +and incorrectly. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair, + ops::PositiveNegativePairOp, + ops::PositiveNegativePairOpMaker); +REGISTER_OP_CPU_KERNEL( + positive_negative_pair, + ops::PositiveNegativePairKernel, + ops::PositiveNegativePairKernel); diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f20f33bbeb19766d6974ea17b155cac363c01fb2 --- /dev/null +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/utils/Logging.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class PositiveNegativePairKernel : public framework::OpKernel { + public: + struct PredictionResult { + PredictionResult(T score, T label, T weight) + : score(score), label(label), weight(weight) {} + T score; + T label; + T weight; + }; + + void Compute(const framework::ExecutionContext& context) const override { + auto score_t = context.Input("Score"); + auto label_t = context.Input("Label"); + auto query_t = context.Input("QueryID"); + auto acc_positive_t = context.Input("AccumulatePositivePair"); + auto acc_negative_t = context.Input("AccumulateNegativePair"); + auto acc_neutral_t = context.Input("AccumulateNeutralPair"); + auto positive_t = context.Output("PositivePair"); + auto negative_t = context.Output("NegativePair"); + auto neutral_t = context.Output("NeutralPair"); + auto weight_t = context.Input("Weight"); + + auto score = score_t->data(); + auto label = label_t->data(); + auto query = query_t->data(); + const T* weight = nullptr; + if (weight_t != nullptr) { + weight = weight_t->data(); + } + T* positive = positive_t->mutable_data(context.GetPlace()); + T* negative = negative_t->mutable_data(context.GetPlace()); + T* neutral = neutral_t->mutable_data(context.GetPlace()); + + auto score_dim = score_t->dims(); + auto batch_size = score_dim[0]; + auto width = score_dim[1]; + auto column = context.Attr("column"); + if (column < 0) { + column += width; + } + + // construct document instances for each query: Query => List[, ...] 
+ std::unordered_map> predictions; + for (auto i = 0; i < batch_size; ++i) { + if (predictions.find(query[i]) == predictions.end()) { + predictions.emplace( + std::make_pair(query[i], std::vector())); + } + predictions[query[i]].emplace_back(score[i * width + column], label[i], + weight_t != nullptr ? weight[i] : 1.0); + } + + // for each query, accumulate pair counts + T pos = 0, neg = 0, neu = 0; + if (acc_positive_t != nullptr && acc_negative_t != nullptr && + acc_neutral_t != nullptr) { + pos = acc_positive_t->data()[0]; + neg = acc_negative_t->data()[0]; + neu = acc_neutral_t->data()[0]; + } + auto evaluate_one_list = [&pos, &neg, + &neu](std::vector vec) { + for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) { + for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) { + if (ite1->label == ite2->label) { // labels are equal, ignore. + continue; + } + T w = (ite1->weight + ite2->weight) * 0.5; + if (ite1->score == ite2->score) { + neu += w; + } + (ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0 + ? pos += w + : neg += w; + } + } + }; + for (auto prediction : predictions) { + evaluate_one_list(prediction.second); + } + *positive = pos; + *negative = neg; + *neutral = neu; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/precision_recall_op.cc b/paddle/fluid/operators/precision_recall_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..30d594719c7274b90a88127028035a49c25e32e7 --- /dev/null +++ b/paddle/fluid/operators/precision_recall_op.cc @@ -0,0 +1,182 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/precision_recall_op.h" + +namespace paddle { +namespace operators { + +class PrecisionRecallOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("MaxProbs"), + "Input(MaxProbs) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchMetrics"), + "Output(BatchMetrics) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AccumMetrics"), + "Output(AccumMetrics) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AccumStatesInfo"), + "Output(AccumStatesInfo) should not be null."); + + int64_t cls_num = + static_cast(ctx->Attrs().Get("class_number")); + auto max_probs_dims = ctx->GetInputDim("MaxProbs"); + auto labels_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(max_probs_dims[1], 1, + "Each instance contains one max probability, so the " + "shape of Input(MaxProbs) should be [batch_size, 1]."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Indices"), max_probs_dims, + "The shape of Input(Indices) should be [batch_size, 1]."); + PADDLE_ENFORCE_EQ(max_probs_dims[0], labels_dims[0], + "The 1st dimension of Input(MaxProbs) and " + "Input(Labels) both are batch_size and the shape should " + "be the same."); + PADDLE_ENFORCE_EQ(labels_dims[1], 1, + "The 2nd dimension of Input(Labels) contains instance " + 
"label and the shape should be equal to 1."); + if (ctx->HasInput("Weights")) { + auto weights_dims = ctx->GetInputDim("Weights"); + PADDLE_ENFORCE_EQ(weights_dims, + framework::make_ddim({max_probs_dims[0], 1}), + "The shape of Input(Weights) should be " + "[batch_size, 1]."); + } + if (ctx->HasInput("StatesInfo")) { + auto states_dims = ctx->GetInputDim("StatesInfo"); + PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}), + "The shape of Input(StatesInfo) should be " + "[class_number, 4]."); + } + + // Layouts of BatchMetrics and AccumMetrics both are: + // [ + // macro average precision, macro average recall, macro average F1 score, + // micro average precision, micro average recall, micro average F1 score + // ] + ctx->SetOutputDim("BatchMetrics", {6}); + ctx->SetOutputDim("AccumMetrics", {6}); + // Shape of AccumStatesInfo is [class_number, 4] + // The layout of each row is: + // [ TP, FP, TN, FN ] + ctx->SetOutputDim("AccumStatesInfo", {cls_num, 4}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("MaxProbs")->type()), + ctx.device_context()); + } +}; + +class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("MaxProbs", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. Each row contains the max probability " + "of an instance which computed by the previous top_k (k=1) " + "operator."); + AddInput("Indices", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. Each row contains the corresponding " + "index which computed by the previous top_k (k=1) operator."); + AddInput("Labels", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. 
Each element is a label and the " + "value should be in [0, class_number - 1]."); + AddInput("Weights", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. This input is optional. If provided, " + "weight of instance would be considered when computing metrics.") + .AsDispensable(); + AddInput("StatesInfo", + "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " + "where D is the number of classes. This input is optional. If " + "provided, current state will be accumulated to this state and " + "the accumulation state will be the output state.") + .AsDispensable(); + AddOutput("BatchMetrics", + "(Tensor, default Tensor) A 1-D tensor with shape {6}. " + "This output tensor contains metrics for current batch data. " + "The layout is [macro average precision, macro average recall, " + "macro f1 score, micro average precision, micro average recall, " + "micro f1 score]."); + AddOutput("AccumMetrics", + "(Tensor, default Tensor) A 1-D tensor with shape {6}. " + "This output tensor contains metrics for accumulated data. " + "The layout is [macro average precision, macro average recall, " + "macro f1 score, micro average precision, micro average recall, " + "micro f1 score]."); + AddOutput("AccumStatesInfo", + "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " + "where D is equal to class number. This output tensor contains " + "accumulated state variables used to compute metrics. The layout " + "for each class is [true positives, false positives, " + "true negatives, false negatives]."); + AddAttr("class_number", "(int) Number of classes to be evaluated."); + AddComment(R"DOC( +Precision Recall Operator. + +When given Input(Indices) and Input(Labels), this operator can be used +to compute various metrics including: +1. macro average precision +2. macro average recall +3. macro f1 score +4. micro average precision +5. micro average recall +6. 
micro f1 score + +To compute the above metrics, we need to do statistics for true positives, +false positives and false negatives. Here the count of true negatives is not +necessary, but counting it may provide potential usage and the cost is +trivial, so the operator also provides the count of true negatives. + +We define state as a 2-D tensor with shape [class_number, 4]. Each row of a +state contains statistic variables for corresponding class. Layout of each row +is: TP(true positives), FP(false positives), TN(true negatives), +FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be +calculated by given weight instead of the instance count. + +This operator also supports metrics computing for cross-batch situation. To +achieve this, Input(StatesInfo) should be provided. State of current batch +data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo) +is the accumulation state. + +Output(BatchMetrics) is metrics of current batch data while +Output(AccumStatesInfo) is metrics of accumulation data. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(precision_recall, ops::PrecisionRecallOp, + ops::PrecisionRecallOpMaker); +REGISTER_OP_CPU_KERNEL( + precision_recall, + ops::PrecisionRecallKernel, + ops::PrecisionRecallKernel); diff --git a/paddle/fluid/operators/precision_recall_op.h b/paddle/fluid/operators/precision_recall_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7dae86b76fc8d50e2ed5fe353920b68f7a846fb1 --- /dev/null +++ b/paddle/fluid/operators/precision_recall_op.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +enum StateVariable { TP = 0, FP, TN, FN }; + +template +class PrecisionRecallKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in0 = ctx.Input("Indices"); + auto* in1 = ctx.Input("Labels"); + auto* in2 = ctx.Input("Weights"); + auto* in3 = ctx.Input("StatesInfo"); + auto* out0 = ctx.Output("BatchMetrics"); + auto* out1 = ctx.Output("AccumMetrics"); + auto* out2 = ctx.Output("AccumStatesInfo"); + + const int* ids_data = in0->data(); + const int* labels_data = in1->data(); + size_t cls_num = static_cast(ctx.Attr("class_number")); + const T* weights_data = in2 ? in2->data() : nullptr; + const T* states_data = in3 ? 
in3->data() : nullptr; + double* batch_metrics_data = out0->mutable_data(ctx.GetPlace()); + double* accum_metrics_data = out1->mutable_data(ctx.GetPlace()); + out2->mutable_data(ctx.GetPlace()); + auto accum_states = EigenMatrix::From(*out2); + accum_states.setZero(); + T* accum_states_data = out2->data(); + + size_t sample_num = in0->dims()[0]; + size_t state_var_num = 4; // TP FP TN FN + + // get states info for current batch + for (size_t i = 0; i < sample_num; ++i) { + size_t idx = ids_data[i]; + size_t label = labels_data[i]; + + PADDLE_ENFORCE(idx >= 0 && idx < cls_num, + "Class index of each instance should be in " + "[0, class_number)."); + PADDLE_ENFORCE(label >= 0 && label < cls_num, + "Label of each instance should be in [0, class_number)."); + + T w = weights_data ? weights_data[i] : 1.0; + if (idx == label) { + accum_states_data[idx * state_var_num + TP] += w; + for (size_t j = 0; j < cls_num; ++j) { + accum_states_data[j * state_var_num + TN] += w; + } + accum_states_data[idx * state_var_num + TN] -= w; + } else { + accum_states_data[label * state_var_num + FN] += w; + accum_states_data[idx * state_var_num + FP] += w; + for (size_t j = 0; j < cls_num; ++j) { + accum_states_data[j * state_var_num + TN] += w; + } + accum_states_data[idx * state_var_num + TN] -= w; + accum_states_data[label * state_var_num + TN] -= w; + } + } + + ComputeMetrics(accum_states_data, batch_metrics_data, state_var_num, + cls_num); + + if (states_data) { + for (size_t i = 0; i < cls_num; ++i) { + for (size_t j = 0; j < state_var_num; ++j) { + size_t idx = i * state_var_num + j; + accum_states_data[idx] += states_data[idx]; + } + } + } + + ComputeMetrics(accum_states_data, accum_metrics_data, state_var_num, + cls_num); + } + + // expose to be reused + static inline T CalcPrecision(T tp_count, T fp_count) { + if (tp_count > 0.0 || fp_count > 0.0) { + return tp_count / (tp_count + fp_count); + } + return 1.0; + } + + static inline T CalcRecall(T tp_count, T fn_count) { + if 
(tp_count > 0.0 || fn_count > 0.0) { + return tp_count / (tp_count + fn_count); + } + return 1.0; + } + + static inline T CalcF1Score(T precision, T recall) { + if (precision > 0.0 || recall > 0.0) { + return 2 * precision * recall / (precision + recall); + } + return 0.0; + } + + protected: + void ComputeMetrics(const T* states_data, double* metrics_data, + size_t state_var_num, size_t cls_num) const { + T total_tp_count = 0; + T total_fp_count = 0; + T total_fn_count = 0; + T macro_avg_precision = 0.0; + T macro_avg_recall = 0.0; + + for (size_t i = 0; i < cls_num; ++i) { + T tp_count = states_data[i * state_var_num + TP]; + T fp_count = states_data[i * state_var_num + FP]; + T fn_count = states_data[i * state_var_num + FN]; + total_tp_count += tp_count; + total_fp_count += fp_count; + total_fn_count += fn_count; + macro_avg_precision += CalcPrecision(tp_count, fp_count); + macro_avg_recall += CalcRecall(tp_count, fn_count); + } + macro_avg_precision /= cls_num; + macro_avg_recall /= cls_num; + T macro_f1_score = CalcF1Score(macro_avg_precision, macro_avg_recall); + + T micro_avg_precision = CalcPrecision(total_tp_count, total_fp_count); + T micro_avg_recall = CalcRecall(total_tp_count, total_fn_count); + T micro_f1_score = CalcF1Score(micro_avg_precision, micro_avg_recall); + + // fill metrics data + metrics_data[0] = macro_avg_precision; + metrics_data[1] = macro_avg_recall; + metrics_data[2] = macro_f1_score; + metrics_data[3] = micro_avg_precision; + metrics_data[4] = micro_avg_recall; + metrics_data[5] = micro_f1_score; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..22b970d971221e86a25e0b72f3d3704e5cee5d7f --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/prelu_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +class PReluOp : public framework::OperatorWithKernel { + public: + PReluOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null"); + PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1, + "Size of weight Alpha must be one."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class PReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of prelu operator."); + AddInput("Alpha", "The alpha weight of prelu operator."); + AddOutput("Out", "The output tensor of prelu operator."); + AddComment(R"DOC( +PRelu Operator. 
+ +The equation is: + +$$ +f(x) = +\begin{cases} +\alpha * x, \quad \text{if} \ x < 0 \\ +x, \qquad \text{if} \ x >= 0 +\end{cases} +$$ + +The input `X` can carry the LoD (Level of Details) information, +or not. And the output shares the LoD information with input `X`. + +)DOC"); + } +}; + +// The operator to calculate gradients of a prelu operator. +class PReluGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("Alpha"), + ctx->GetInputDim("Alpha")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, + ops::PReluGradOp); +REGISTER_OP_CPU_KERNEL( + prelu, ops::PReluKernel); +REGISTER_OP_CPU_KERNEL( + prelu_grad, + ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..038b09a493c5064d5419260b2fbfdf56b6bb5982 --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/prelu_op.h" + +REGISTER_OP_CUDA_KERNEL( + prelu, + paddle::operators::PReluKernel); +REGISTER_OP_CUDA_KERNEL(prelu_grad, + paddle::operators::PReluGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..85ad75d479001ec5dad1b796d4932c7e6c4ab7af --- /dev/null +++ b/paddle/fluid/operators/prelu_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::Transform; + +template +class PReluFunctor { + public: + explicit PReluFunctor(const T* alpha) : alpha_(alpha) {} + + HOSTDEVICE T operator()(const T& x) const { + if (x > 0) + return x; + else + return x * (*alpha_); + } + + private: + const T* alpha_; +}; + +template +class PReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* alpha = context.Input("Alpha"); + auto* out = context.Output("Out"); + + const T* x_ptr = x->data(); + T* o_ptr = out->mutable_data(context.GetPlace()); + + auto* alpha_ptr = alpha->data(); + + int numel = x->numel(); + + Transform trans; + trans(context.template device_context(), x_ptr, + x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); + } +}; + +template +class PReluGradFunctor { + public: + explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {} + + HOSTDEVICE T operator()(const T& out, const T& dout) const { + if (out > 0) + return dout; + else + return dout * (*alpha_); + } + + private: + const T* alpha_; +}; + +template +class PReluGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dx = context.Output(framework::GradVarName("X")); + auto* dout = context.Input(framework::GradVarName("Out")); + + auto* out = context.Input("Out"); + auto* alpha = context.Input("Alpha"); + auto* alpha_ptr = alpha->data(); + + T* dx_ptr = dx->mutable_data(context.GetPlace()); + const T* dout_ptr = dout->data(); + const T* out_ptr = out->data(); + int numel = dx->numel(); + + Transform trans; + trans(context.template device_context(), out_ptr, + out_ptr + numel, dout_ptr, dx_ptr, 
PReluGradFunctor(alpha_ptr)); + + // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3616545309e8c279f61a22e571a5e71335c47f93 --- /dev/null +++ b/paddle/fluid/operators/print_op.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace operators { + +#define CLOG std::cout + +const std::string kForward = "FORWARD"; +const std::string kBackward = "BACKWARD"; +const std::string kBoth = "BOTH"; + +struct Formater { + std::string message; + std::string name; + std::vector dims; + std::type_index dtype{typeid(char)}; + framework::LoD lod; + int summarize; + void* data{nullptr}; + + void operator()(size_t size) { + PrintMessage(); + PrintName(); + PrintDims(); + PrintDtype(); + PrintLod(); + PrintData(size); + } + + private: + void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message; } + void PrintName() { + if (!name.empty()) { + CLOG << "Tensor[" << name << "]" << std::endl; + } + } + void PrintDims() { + if (!dims.empty()) { + CLOG << "\tshape: ["; + for (auto i : dims) { + CLOG << i << ","; + } + CLOG << "]" << std::endl; + } + } + void PrintDtype() { + if (dtype.hash_code() != typeid(char).hash_code()) { + CLOG << "\tdtype: " << dtype.name() << std::endl; + } + } + void PrintLod() { + if (!lod.empty()) { + CLOG << "\tLoD: ["; + for (auto level : lod) { + CLOG << "[ "; + for (auto i : level) { + CLOG << i << ","; + } + CLOG << " ]"; + } + CLOG << "]" << std::endl; + } + } + + void PrintData(size_t size) { + PADDLE_ENFORCE_NOT_NULL(data); + // print float + if (dtype.hash_code() == typeid(float).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(double).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int64_t).hash_code()) { + Display(size); + } + } + + template + void Display(size_t size) { + auto* d = (T*)data; + CLOG << "\tdata: "; + if (summarize != -1) { + summarize = std::min(size, (size_t)summarize); + for (int i = 0; i < summarize; i++) { + CLOG << d[i] << ","; + } + } else { + for (size_t i = 0; i < size; i++) { + 
CLOG << d[i] << ","; + } + } + CLOG << std::endl; + } +}; + +// TODO(ChunweiYan) there should be some other printers for TensorArray +class TensorPrintOp : public framework::OperatorBase { + public: + TensorPrintOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + TensorPrintOp(const TensorPrintOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented."); + } + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + const framework::Variable* in_var_ptr = nullptr; + std::string phase = kForward; + std::string printed_var_name = ""; + + auto& inputs = Inputs(); + if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) { + in_var_ptr = scope.FindVar(Input("In")); + printed_var_name = Inputs("In").front(); + } else if (inputs.find("In@GRAD") != inputs.end() && + !Inputs("In@GRAD").empty()) { + in_var_ptr = scope.FindVar(Input("In@GRAD")); + printed_var_name = Inputs("In@GRAD").front(); + phase = kBackward; + } else { + PADDLE_THROW("Unknown phase, should be forward or backward."); + } + + PADDLE_ENFORCE_NOT_NULL(in_var_ptr); + + auto& in_tensor = in_var_ptr->Get(); + auto* out_var_ptr = scope.FindVar(Output("Out")); + auto& out_tensor = *out_var_ptr->GetMutable(); + + // Just copy data from input tensor to output tensor + // output tensor share same memory with input tensor + out_tensor.ShareDataWith(in_tensor); + out_tensor.set_lod(in_tensor.lod()); + + std::string print_phase = Attr("print_phase"); + if (print_phase != phase && print_phase != kBoth) { + return; + } + + int first_n = Attr("first_n"); + if (first_n > 0 && ++times_ > first_n) return; + + framework::LoDTensor printed_tensor; + printed_tensor.set_lod(in_tensor.lod()); + printed_tensor.Resize(in_tensor.dims()); + + if (platform::is_cpu_place(in_tensor.place())) { + 
printed_tensor.ShareDataWith(in_tensor); + } else { + // copy data to cpu to print + platform::CPUPlace place; + framework::Copy(in_tensor, place, &printed_tensor); + } + + Formater formater; + if (Attr("print_tensor_name")) { + formater.name = printed_var_name; + } + if (Attr("print_tensor_type")) { + formater.dtype = printed_tensor.type(); + } + if (Attr("print_tensor_shape")) { + auto& dims = printed_tensor.dims(); + formater.dims.resize(dims.size()); + for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i]; + } + if (Attr("print_tensor_lod")) { + formater.lod = printed_tensor.lod(); + } + formater.summarize = Attr("summarize"); + formater.data = (void*)printed_tensor.data(); + formater(printed_tensor.numel()); + } + + private: + mutable int times_{0}; +}; + +class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { + public: + PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In", "Input tensor to be displayed."); + AddAttr("first_n", "Only log `first_n` number of times."); + AddAttr("message", "A string message to print as a prefix."); + AddAttr("summarize", "Number of elements printed."); + AddAttr("print_tensor_name", "Whether to print the tensor name."); + AddAttr("print_tensor_type", "Whether to print the tensor's dtype."); + AddAttr("print_tensor_shape", "Whether to print the tensor's shape."); + AddAttr("print_tensor_lod", "Whether to print the tensor's lod."); + AddAttr( + "print_phase", + "(string, default 'BOTH') Which phase to display including 'FORWARD' " + "'BACKWARD' and 'BOTH'.") + .SetDefault(kBoth) + .InEnum({kForward, kBackward, kBoth}); + AddOutput("Out", "Output tensor with same data as input tensor."); + AddComment(R"DOC( +Creates a print op that will print when a tensor is accessed. 
+ +Wraps the tensor passed in so that whenever that a tensor is accessed, +the message `message` is printed, along with the current value of the +tensor `t`.)DOC"); + } +}; + +class InferShapeForward : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); + context->ShareLoD("In", /*->*/ "Out"); + context->SetOutputDim("Out", context->GetInputDim("In")); + } +}; + +class InferShapeBackward : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("In@GRAD"), + "Input(In@GRAD) should not be null."); + context->ShareLoD("In@GRAD", /*->*/ "Out"); + context->SetOutputDim("Out", context->GetInputDim("In@GRAD")); + } +}; + +class InferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override {} +}; + +class PrintOpProtoAndCheckGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("print_grad"); + op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out")); + op_desc_ptr->SetOutput("Out", InputGrad("In")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, + ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward, + ops::InferVarType); +REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, ops::InferShapeBackward); diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..ed48603e17f38f89705186fb9fb992f69d26d2ff --- /dev/null +++ b/paddle/fluid/operators/prior_box_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/prior_box_op.h" + +namespace paddle { +namespace operators { + +class PriorBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of PriorBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Image"), + "Input(Image) of PriorBoxOp should not be null."); + + auto image_dims = ctx->GetInputDim("Image"); + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW."); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + PADDLE_ENFORCE_LT(input_dims[2], image_dims[2], + "The height of input must smaller than image."); + + PADDLE_ENFORCE_LT(input_dims[3], image_dims[3], + "The width of input must smaller than image."); + + auto min_sizes = ctx->Attrs().Get>("min_sizes"); + auto max_sizes = ctx->Attrs().Get>("max_sizes"); + auto variances = ctx->Attrs().Get>("variances"); + auto aspect_ratios = ctx->Attrs().Get>("aspect_ratios"); + bool flip = ctx->Attrs().Get("flip"); + + std::vector aspect_ratios_vec; + 
ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + + int num_priors = aspect_ratios_vec.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(), + "The number of min_size and max_size must be equal."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i], + "max_size[%d] must be greater than min_size[%d].", i, + i); + num_priors += 1; + } + } + + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } +}; + +class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor, default Tensor), " + "the input feature data of PriorBoxOp, The layout is NCHW."); + AddInput("Image", + "(Tensor, default Tensor), " + "the input image data of PriorBoxOp, The layout is NCHW."); + AddOutput("Boxes", + "(Tensor, default Tensor), the output prior boxes of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. 
" + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + + AddAttr>("min_sizes", + "(vector) List of min sizes " + "of generated prior boxes.") + .AddCustomChecker([](const std::vector& min_sizes) { + PADDLE_ENFORCE_GT(min_sizes.size(), 0, + "Size of min_sizes must be at least 1."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(min_sizes[i], 0, + "min_sizes[%d] must be positive.", i); + } + }); + AddAttr>( + "max_sizes", + "(vector) List of max sizes of generated prior boxes."); + AddAttr>( + "aspect_ratios", + "(vector) List of aspect ratios of generated prior boxes."); + + AddAttr>( + "variances", + "(vector) List of variances to be encoded in prior boxes.") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + AddAttr("flip", "(bool) Whether to flip aspect ratios.") + .SetDefault(true); + AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") + .SetDefault(true); + + AddAttr("step_w", + "Prior boxes step across width, 0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_w) { + PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0."); + }); + AddAttr("step_h", + "Prior boxes step across height, 0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_h) { + PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0."); + }); + + AddAttr("offset", + "(float) " + "Prior boxes center offset.") + .SetDefault(0.5); + AddComment(R"DOC( +Prior box operator +Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. 
+Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + +Please get more information from the following papers: +https://arxiv.org/abs/1512.02325. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker); +REGISTER_OP_CPU_KERNEL( + prior_box, ops::PriorBoxOpKernel, + ops::PriorBoxOpKernel); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fd07041233495660605e9cf9acb33d57eb57bc30 --- /dev/null +++ b/paddle/fluid/operators/prior_box_op.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, + bool flip, + std::vector& output_aspect_ratior) { + constexpr float epsilon = 1e-6; + output_aspect_ratior.clear(); + output_aspect_ratior.push_back(1.0f); + for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { + float ar = input_aspect_ratior[i]; + bool already_exist = false; + for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { + if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + already_exist = true; + break; + } + } + if (!already_exist) { + output_aspect_ratior.push_back(ar); + if (flip) { + output_aspect_ratior.push_back(1.0f / ar); + } + } + } +} + +template +struct ClipFunctor { + HOSTDEVICE inline T operator()(T in) const { + return std::min(std::max(in, 0.), 1.); + } +}; + +template +class PriorBoxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h 
== 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + T inv_img_width = 1.0 / img_width; + T inv_img_height = 1.0 / img_height; + + auto e_boxes = framework::EigenTensor::From(*boxes); + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + T box_width, box_height; + int idx = 0; + for (size_t s = 0; s < min_sizes.size(); ++s) { + int min_size = min_sizes[s]; + // first prior: aspect_ratio = 1, size = min_size + box_width = box_height = min_size; + // xmin + e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width; + // ymin + e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + + idx++; + if (max_sizes.size() > 0) { + int max_size = max_sizes[s]; + // second prior: aspect_ratio = 1, + // size = sqrt(min_size * max_size) + box_width = box_height = sqrt(min_size * max_size); + // xmin + e_boxes(h, w, idx, 0) = + (center_x - box_width * 0.5) * inv_img_width; + // ymin + e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = + (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + idx++; + } + + // rest of priors + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + if (fabs(ar - 1.) 
< 1e-6) { + continue; + } + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + // xmin + e_boxes(h, w, idx, 0) = + (center_x - box_width * 0.5) * inv_img_width; + // ymin + e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = + (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + idx++; + } + } + } + } + + if (clip) { + platform::Transform trans; + ClipFunctor clip_func; + trans(ctx.template device_context(), + boxes->data(), boxes->data() + boxes->numel(), + boxes->data(), clip_func); + } + + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + auto var_et = framework::EigenTensor::From(var_t); + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = vars->dims(); + vars->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); + + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/proximal_adagrad_op.cc b/paddle/fluid/operators/proximal_adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9e3894c576a94c094c0f4b72e3b6519c4ec26e1 --- /dev/null +++ b/paddle/fluid/operators/proximal_adagrad_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/proximal_adagrad_op.h" + +namespace paddle { +namespace operators { + +class ProximalAdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of ProximalAdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("MomentOut"), + "Output(MomentOut) of ProximalAdagradOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad of ProximalAdagrad Op must have same dimension."); + + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Moment"), + "Param and Moment of ProximalAdagrad Op must have same dimension."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("MomentOut", param_dim); + } +}; + +class ProximalAdagradOpMaker : public 
framework::OpProtoAndCheckerMaker { + public: + ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated."); + AddInput("Moment", + "(Tensor, default Tensor) " + "Moment parameter that has to be updated."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("MomentOut", "(Tensor) Output updated moment value."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddComment(R"DOC( +Proximal Adagrad Optimizer. + +Optimizer that implements the proximal adagrad algorithm: + +$$ +moment = moment + grad * grad \\ +prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\ +param = sign(prox\_param) / (1 + learning\_rate * l2) * + \max(|prox\_param| - learning\_rate * l1 , 0) +$$ + +The paper that proposed Proximal GD: +(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) +Here, we use the adagrad learning rate as specified here: +(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, + ops::ProximalAdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/proximal_adagrad_op.cu b/paddle/fluid/operators/proximal_adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..54c75b3abb8e84b2aa55d044e79423cf86523d76 --- /dev/null +++ 
b/paddle/fluid/operators/proximal_adagrad_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/proximal_adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/proximal_adagrad_op.h b/paddle/fluid/operators/proximal_adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..70205a8d11f757d08150a81d1369133778ad996c --- /dev/null +++ b/paddle/fluid/operators/proximal_adagrad_op.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ProximalAdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* moment_out = ctx.Output("MomentOut"); + + param_out->mutable_data(ctx.GetPlace()); + moment_out->mutable_data(ctx.GetPlace()); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + + auto grad = ctx.Input("Grad"); + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto m = EigenVector::Flatten(*ctx.Input("Moment")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto m_out = EigenVector::Flatten(*moment_out); + auto* place = ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + m_out.device(*place) = m + g * g; + auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); + if (l1 > static_cast(0)) { + p_out.device(*place) = + prox_param.sign() * + (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) + .cwiseMax(static_cast(0.0))) / + (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); + } else { + p_out.device(*place) = + prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/proximal_gd_op.cc b/paddle/fluid/operators/proximal_gd_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..de7c6843c8ba7c1cc9229a11707de7a1400deee1 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/proximal_gd_op.h" + +namespace paddle { +namespace operators { + +class ProximalGDOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of ProximalGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of ProximalGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of ProximalGDOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of ProximalGDOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), + "Two input of ProximalGD Op's dimension must be same."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + } +}; + +class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the 
parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddComment(R"DOC( +ProximalGD Operator. + +Optimizer that implements the proximal gradient descent algorithm: + +$$ +prox\_param = param - learning\_rate * grad \\ +param = sign(prox\_param) / (1 + learning\_rate * l2) * + \max(|prox\_param| - learning\_rate * l1, 0) +$$ + +The paper that proposed Proximal Gradient Descent: +(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, + ops::ProximalGDOpMaker); +REGISTER_OP_CPU_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/proximal_gd_op.cu b/paddle/fluid/operators/proximal_gd_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..97b672e872c99783ef4c0cd085e4b86380a06e10 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/proximal_gd_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/proximal_gd_op.h b/paddle/fluid/operators/proximal_gd_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8372380f25277e7774b72e144e00ea80e76a71e0 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ProximalGDOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + + param_out->mutable_data(ctx.GetPlace()); + + auto grad = ctx.Input("Grad"); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + auto prox_param = p - lr.broadcast(grad_dsize) * g; + if (l1 > 0) { + p_out.device(place) = + prox_param.sign() * + (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) + .cwiseMax(T(0.0))) / + (1.0 + (lr * l2).broadcast(grad_dsize))); + } else { + p_out.device(place) = + prox_param / (1.0 + (lr * l2).broadcast(grad_dsize)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..222ca73d2acfa8cc3d6fa6a3badce4606be9bcb0 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/rank_loss_op.h" + +namespace paddle { +namespace operators { + +class RankLossOp : public framework::OperatorWithKernel { + public: + RankLossOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null."); + + auto label_dims = ctx->GetInputDim("Label"); + auto left_dims = ctx->GetInputDim("Left"); + auto right_dims = ctx->GetInputDim("Right"); + + PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), + "All inputs must have the same size."); + PADDLE_ENFORCE( + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensors with shape [batch_size x 1]."); + ctx->SetOutputDim("Out", label_dims); + } +}; + +class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Label", + "(2-D Tensor with shape [batch_size x 1]) " + "The label indicating A ranked higher than B or not."); + AddInput("Left", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc A."); + AddInput("Right", + "(2-D 
Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc B."); + AddOutput("Out", + "(2-D Tensor with shape [batch_size x 1]) " + "The output loss of RankLoss operator."); + AddComment(R"DOC( +RankLoss Operator. + +RankLoss operator for RankNet +(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). +RankNet is a pairwise ranking model with +one training sample consisting of a pair of doc A and B, and the label P +indicating that A is ranked higher than B or not: + +P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of +the input pair. + +The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following +equation: + +$$ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ + o_{i,j} = o_i - o_j \\ + \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} +$$ + +The operator can take batch inputs with size batch_size (batch_size >= 1). 
+ +)DOC"); + } +}; + +class RankLossGradOp : public framework::OperatorWithKernel { + public: + RankLossGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto dims = ctx->GetInputDim("Left"); + auto left_grad_name = framework::GradVarName("Left"); + auto right_grad_name = framework::GradVarName("Right"); + + if (ctx->HasOutput(left_grad_name)) { + ctx->SetOutputDim(left_grad_name, dims); + } + + if (ctx->HasOutput(right_grad_name)) { + ctx->SetOutputDim(right_grad_name, dims); + } + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, + ops::RankLossGradOp); +REGISTER_OP_CPU_KERNEL( + rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, + ops::RankLossGradKernel); diff --git a/paddle/fluid/operators/rank_loss_op.cu b/paddle/fluid/operators/rank_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b182ced70d2a234237f1de822c3dc81047ebda7 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/rank_loss_op.h" + +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..08bb2c28218e8c478af426b560efc9a4b6161696 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RankLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); + out_t->mutable_data(ctx.GetPlace()); + + auto out = framework::EigenVector::Flatten(*out_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto left = framework::EigenVector::Flatten(*left_t); + auto right = framework::EigenVector::Flatten(*right_t); + + auto& dev = *ctx.template device_context().eigen_device(); + out.device(dev) = + (1. + (left - right).exp()).log() - label * (left - right); + } +}; + +template +class RankLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_left_t = + ctx.Output(framework::GradVarName("Left")); + auto* d_right_t = + ctx.Output(framework::GradVarName("Right")); + + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); + + auto& dev = *ctx.template device_context().eigen_device(); + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto left = framework::EigenVector::Flatten(*left_t); + auto right = framework::EigenVector::Flatten(*right_t); + + // compute d_left + if (d_left_t) { + d_left_t->mutable_data(ctx.GetPlace()); + auto d_left = framework::EigenVector::Flatten(*d_left_t); + d_left.device(dev) = d_out * (1. / (1. 
+ (right - left).exp()) - label); + } + // compute d_right + if (d_right_t) { + d_right_t->mutable_data(ctx.GetPlace()); + auto d_right = framework::EigenVector::Flatten(*d_right_t); + d_right.device(dev) = + -d_out * (1.0 / (1. + (right - left).exp()) - label); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d562c291911f54c9d1e8fed2e84035808bffbb7 --- /dev/null +++ b/paddle/fluid/operators/read_op.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+
+// Shape inference for the read op: requires a "Reader" input and "Out"
+// outputs, checks that the reader reports as many dims as there are outputs,
+// and assigns those dims to the outputs.
+class ReadInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Reader"),
+                   "The ReadOp must take a reader as input.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "The ReadOp should be assigned with output.");
+    std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
+    std::vector<std::string> out_names = ctx->Outputs("Out");
+    PADDLE_ENFORCE_EQ(
+        reader_dims.size(), out_names.size(),
+        "The reader's dim number doesn't match the output number.");
+    ctx->SetOutputsDim("Out", reader_dims);
+  }
+};
+
+// Variable type inference for the read op: marks every output as a LoD tensor
+// and gives it the data type the reader variable declares at the same index.
+class ReadInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    std::string reader_name = op_desc.Input("Reader")[0];
+    std::vector<std::string> out_names = op_desc.Output("Out");
+    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+    // Check the lookup result before it is dereferenced below.
+    PADDLE_ENFORCE(reader != nullptr, "Cannot find the reader variable %s.",
+                   reader_name);
+    auto dtypes = reader->GetDataTypes();
+    PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
+    for (size_t i = 0; i < dtypes.size(); ++i) {
+      framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
+      out.SetType(framework::proto::VarDesc::LOD_TENSOR);
+      out.SetDataType(dtypes[i]);
+    }
+  }
+};
+
+// Executes the reader once and exposes the tensors it produced through the
+// "Out" variables. When the reader has no next item it is re-initialized
+// first; the op fails if it still has nothing to read afterwards.
+class ReadOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    auto* reader_var = scope.FindVar(Input("Reader"));
+    PADDLE_ENFORCE(reader_var != nullptr,
+                   "Cannot find the reader variable %s.", Input("Reader"));
+    framework::ReaderHolder* reader =
+        reader_var->GetMutable<framework::ReaderHolder>();
+    if (!reader->HasNext()) {
+      reader->ReInit();
+      PADDLE_ENFORCE(
+          reader->HasNext(),
+          "Reader can not read the next data even it has been re-initialized.");
+    }
+    const std::vector<std::string>& out_arg_names = Outputs("Out");
+    std::vector<framework::LoDTensor> ins;
+    reader->ReadNext(&ins);
+    PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
+    for (size_t i = 0; i < ins.size(); ++i) {
+      auto* out_var = scope.FindVar(out_arg_names[i]);
+      PADDLE_ENFORCE(out_var != nullptr,
+                     "Cannot find the output variable %s.", out_arg_names[i]);
+      auto* out = out_var->GetMutable<framework::LoDTensor>();
+      // The output shares the reader's buffer instead of copying it; the LoD
+      // has to be carried over separately.
+      out->ShareDataWith(ins[i]);
+      out->set_lod(ins[i].lod());
+    }
+  }
+};
+
+// Declares the inputs, outputs and documentation of the read op.
+class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput("Reader", "(ReaderHolder) The executed reader.");
+    AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
+    AddComment(R"DOC(
+      Read Operator
+
+      Execute a given reader once and output data.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker,
+                  paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType);
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4b9b8dab9b0394752d538aa5f59be3c06d0188f
--- /dev/null
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -0,0 +1,635 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Names of the operator's inputs, outputs and attributes.
+constexpr char kInputs[] = "inputs";
+constexpr char kInitialStates[] = "initial_states";
+constexpr char kParameters[] = "parameters";
+constexpr char kOutputs[] = "outputs";
+constexpr char kStepScopes[] = "step_scopes";
+constexpr char kExStates[] = "ex_states";
+constexpr char kStates[] = "states";
+constexpr char kStepBlock[] = "sub_block";
+constexpr char kReverse[] = "reverse";
+constexpr char kIsTrain[] = "is_train";
+// Gradient slot names are the forward names with GRAD_SUFFIX appended through
+// string-literal concatenation.
+#define GRAD_SUFFIX "@GRAD"
+constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX;
+constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX;
+constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX;
+constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX;
+
+using StepScopeVar = std::vector<framework::Scope *>;
+
+// StepScopes manages scopes inside RNN.
+//    StepScopes::CurScope() get the current scope
+//    StepScopes::ExScope() get the ex-scope, or scope in previous time step.
+//    StepScopes::Next() move to next time step.
+//
+// if is_train = False, then
+//   there are two scopes for the RNN and just support forward.
+// else
+//   the len(scopes) == seq_len
+//
+// if is_backward = True, then
+//   reversely access scopes
+// else
+//   access scopes from begin to end.
+class StepScopes {
+ public:
+  // In forward mode `scopes` must be empty and is filled with new child
+  // scopes of `parent`; in backward mode the existing scopes are reused and
+  // iteration starts at the last one.
+  StepScopes(const framework::Scope &parent, StepScopeVar *scopes,
+             bool is_train, size_t seq_len, bool is_backward = false)
+      : counter_(is_backward ? seq_len - 1 : 0UL),
+        scopes_(scopes),
+        is_train_(is_train),
+        is_backward_(is_backward) {
+    size_t num_step_scopes = is_train ? seq_len : 2;
+    PADDLE_ENFORCE(is_train || !is_backward,
+                   "Cannot backward when is not training");
+    if (!is_backward_) {
+      PADDLE_ENFORCE(scopes->empty());
+      scopes->reserve(static_cast<size_t>(num_step_scopes));
+      for (size_t i = 0; i < num_step_scopes; ++i) {
+        scopes->emplace_back(&parent.NewScope());
+      }
+    }
+  }
+
+  framework::Scope &CurScope() { return GetScope(counter_); }
+
+  // The scope visited one Next() ago: counter_ - 1 when iterating forward,
+  // counter_ + 1 when iterating backward. Only meaningful after at least one
+  // call to Next().
+  framework::Scope &ExScope() {
+    auto &scope = GetScope(is_backward_ ? counter_ + 1 : counter_ - 1);
+    return scope;
+  }
+
+  void Next() {
+    if (is_backward_) {
+      --counter_;
+    } else {
+      ++counter_;
+    }
+  }
+
+ private:
+  framework::Scope &GetScope(size_t scope_id) const {
+    if (!is_train_) {
+      // Only two scopes exist when not training; alternate between them.
+      scope_id %= 2;
+    }
+    PADDLE_ENFORCE_LT(scope_id, scopes_->size());
+    return *(*scopes_)[scope_id];
+  }
+
+  size_t counter_;
+  StepScopeVar *scopes_;
+  bool is_train_;
+  bool is_backward_;
+};
+
+// Base class for RecurrentOp/RecurrentGradOp
+//    Some common protected functions for RecurrentOp/RecurrentGradOp
+class RecurrentBase : public framework::OperatorBase {
+ public:
+  RecurrentBase(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  // Get SequenceLength from Scope
+  //   The sequence length is got from input tensor. The input tensor's
+  //   dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape
+  //   is SEQ_LEN. The second of the tensor's shape could be the batch size or
+  //   nested sequence length.
+  int64_t GetSequenceLength(const framework::Scope &scope) const {
+    // Dim format SEQ_LEN, BATCH_SIZE, ...
+    int64_t seq_len = -1;
+    auto &all_inputs = Inputs(kInputs);
+    PADDLE_ENFORCE(!all_inputs.empty());
+    for (auto &iname : all_inputs) {
+      auto *var = scope.FindVar(iname);
+      PADDLE_ENFORCE(var != nullptr);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>());
+      auto &dim = var->Get<framework::LoDTensor>().dims();
+      if (seq_len == -1) {
+        seq_len = dim[0];
+      } else {
+        // Every input has to agree on the leading dimension.
+        PADDLE_ENFORCE_EQ(seq_len, dim[0]);
+      }
+    }
+    return seq_len;
+  }
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   dst_tensor.ShareDataWith(src_tensor)
+  static void LinkTensor(const framework::Scope &src_scope,
+                         const std::vector<std::string> &src_vars,
+                         framework::Scope *dst_scope,
+                         const std::vector<std::string> &dst_vars) {
+    LinkTensorWithCallback(
+        src_scope, src_vars, dst_scope, dst_vars,
+        [&](const framework::Tensor &src, framework::Tensor *dst) {
+          dst->ShareDataWith(src);
+        });
+  }
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     framework::Scope *dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
+    }
+  }
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.FindVar, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     const framework::Scope &dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
+    }
+  }
+
+  // (seq_len, shape) -> return [seq_len] + list(shape)
+  static framework::DDim PrependDims(size_t seq_len,
+                                     const framework::DDim &src) {
+    auto dims = framework::vectorize(src);
+    dims.insert(dims.begin(), static_cast<int64_t>(seq_len));
+    return framework::make_ddim(dims);
+  }
+
+ private:
+  // Pointer overload: the destination is obtained with Scope::Var.
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           framework::Scope *dst_scope,
+                           const std::string &dst_var_name, Callback callback) {
+    auto *src_var = src_scope.FindVar(src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+
+    auto *dst_var = dst_scope->Var(dst_var_name);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+
+  // Const-reference overload: the destination must already exist and is
+  // obtained with Scope::FindVar.
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           const framework::Scope &dst_scope,
+                           const std::string &dst_var_name, Callback callback) {
+    auto *src_var = src_scope.FindVar(src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+    auto *dst_var = dst_scope.FindVar(dst_var_name);
+    PADDLE_ENFORCE(dst_var != nullptr);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+};
+
+// Forward pass: runs the step block once per time step, feeding it one slice
+// of every input and the states of the previous step, and copies the step
+// outputs into the matching slice of the outer outputs.
+class RecurrentOp : public RecurrentBase {
+ public:
+  RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : RecurrentBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
+    VLOG(3) << "Static RNN input sequence length = " << seq_len;
+    StepScopes scopes = CreateStepScopes(scope, seq_len);
+    auto reverse = Attr<bool>(kReverse);
+
+    framework::Executor executor(place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+    auto *program = block->Program();
+
+    // get device context from pool; it does not depend on the time step, so
+    // it is looked up once, as in RecurrentGradOp::Run
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < seq_len; ++i) {
+      size_t seq_offset = reverse ? seq_len - i - 1 : i;
+      VLOG(3) << "Recurrent operate at the time step " << seq_offset;
+
+      auto &cur_scope = scopes.CurScope();
+
+      // Link outside::input --> inside::input
+      //   inside::input = outside::input[seq_offset: seq_offset+1]
+      LinkTensorWithCallback(
+          scope, Inputs(kInputs), &cur_scope, Inputs(kInputs),
+          [&seq_offset](const framework::Tensor &outside,
+                        framework::Tensor *inside) {
+            inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1));
+            // Drop the leading dimension of the one-step slice.
+            auto dims = framework::vectorize(inside->dims());
+            dims.erase(dims.begin());
+            inside->Resize(framework::make_ddim(dims));
+          });
+
+      if (i == 0) {
+        // Link initial states --> ex_states
+        LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
+                   Attr<std::vector<std::string>>(kExStates));
+      } else {
+        auto &ex_scope = scopes.ExScope();
+        // Link ex_scope::state --> cur_scope::ex_state
+        LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
+                   &cur_scope, Attr<std::vector<std::string>>(kExStates));
+      }
+
+      // Every inputs are linked now, execute!
+      executor.Run(*program, &cur_scope, block->ID(),
+                   false /*create_local_scope*/);
+
+      // Copy inside::output -> outside::output
+      //    outside::output[seq_offset: seq_offset + 1] = inside::output
+      this->LinkTensorWithCallback(
+          cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
+          [&](const framework::LoDTensor &src_tensor,
+              framework::LoDTensor *dst_tensor) {
+            if (i == 0) {  // create output tensor at begin
+              dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
+              dst_tensor->mutable_data(place, src_tensor.type());
+            }
+
+            auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
+            // Explicit copy output since the local RNN scope can be destroyed
+            // early.
+            framework::Copy(src_tensor, place, dev_ctx, &dst_out);
+          });
+
+      scopes.Next();
+    }
+  }
+
+ private:
+  // Builds the forward StepScopes over the variable named by the
+  // kStepScopes output.
+  StepScopes CreateStepScopes(const framework::Scope &scope,
+                              size_t seq_len) const {
+    auto *var = scope.FindVar(Output(kStepScopes));
+    PADDLE_ENFORCE(var != nullptr);
+    return StepScopes(scope, var->GetMutable<StepScopeVar>(),
+                      Attr<bool>(kIsTrain), seq_len);
+  }
+};
+
+// Backward pass: walks the step scopes created by the forward pass in reverse
+// order, runs the gradient step block in each, accumulates parameter
+// gradients and copies input / initial-state gradients to the outer scope.
+class RecurrentGradOp : public RecurrentBase {
+ public:
+  RecurrentGradOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : RecurrentBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
+    StepScopes scopes = CreateStepScopes(scope, seq_len);
+    auto reverse = Attr<bool>(kReverse);
+
+    framework::Executor executor(place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+    auto *program = block->Program();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t step_id = 0; step_id < seq_len; ++step_id) {
+      size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
+      VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
+      auto &cur_scope = scopes.CurScope();
+      // Link outside::output_grads --> inside::output_grads
+      //   inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
+      LinkTensorWithCallback(
+          scope, Inputs(kOutputGrads), &cur_scope, Inputs(kOutputGrads),
+          [&](const framework::Tensor &outside, framework::Tensor *inside) {
+            inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1));
+            auto dims = framework::vectorize(inside->dims());
+            dims.erase(dims.begin());
+            inside->Resize(framework::make_ddim(dims));
+          });
+
+      if (VLOG_IS_ON(10)) {
+        // The name set is only needed for this log line, so it is built only
+        // when the log level is enabled.
+        auto og_set = List2Set(Inputs(kOutputGrads));
+        std::ostringstream sout;
+        std::copy(og_set.begin(), og_set.end(),
+                  std::ostream_iterator<std::string>(sout, ","));
+        VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
+      }
+
+      // Link states
+      //   if cur_scope::cur_state_grad in out_grads:
+      //     cur_scope::cur_state_grad += ex_scope::ex_state_grad
+      //   else:
+      //     ex_scope::ex_state_grad --> cur_scope::cur_state_grad
+      // NOTE(review): the code below always takes the copy branch; the
+      // accumulation branch described above is not implemented here.
+      if (step_id != 0) {  // not at beginning
+        auto &ex_scope = scopes.ExScope();
+        auto ex_state_grads =
+            GradVarLists(Attr<std::vector<std::string>>(kExStates));
+        auto cur_state_grads =
+            GradVarLists(Attr<std::vector<std::string>>(kStates));
+
+        PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
+        for (size_t i = 0; i < ex_state_grads.size(); ++i) {
+          auto &cur_grad = cur_state_grads[i];
+          auto &ex_grad = ex_state_grads[i];
+          auto *ex_grad_var = ex_scope.FindVar(ex_grad);
+          // Check the lookup result before it is dereferenced.
+          PADDLE_ENFORCE(ex_grad_var != nullptr,
+                         "Cannot find the state gradient %s in the ex-scope.",
+                         ex_grad);
+          auto &ex_tensor = ex_grad_var->Get<framework::LoDTensor>();
+
+          VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
+          auto *cur_grad_var = cur_scope.Var(cur_grad);
+          auto cur_grad_tensor =
+              cur_grad_var->GetMutable<framework::LoDTensor>();
+          framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor);
+        }
+      }
+
+      VLOG(5) << "Recurrent memory linking finished ";
+      // Run step block with cur_scope
+      executor.Run(*program, &cur_scope, block->ID(),
+                   false /*create_local_scope*/);
+
+      VLOG(5) << "executor.Run finished ";
+
+      auto local_var_names = LocalVarNames(cur_scope);
+
+      // Accumulate params
+      //   if (step == 0):
+      //      outside::param_grad = 0.0
+      //   outside::param_grad += inside::param_grad
+      {
+        auto &pg_names = Outputs(kParamGrads);
+        auto &p_names = Inputs(kParameters);
+        PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
+
+        for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
+          auto inside_grad_name = framework::GradVarName(p_names[param_id]);
+
+          // If does not compute gradient of that variable inside rnn, just
+          // continue
+          if (local_var_names.find(inside_grad_name) == local_var_names.end()) {
+            continue;
+          }
+
+          // zero gradient variable in step 0
+          if (step_id == 0) {
+            auto &inside_tensor = cur_scope.FindVar(inside_grad_name)
+                                      ->Get<framework::LoDTensor>();
+            framework::AttributeMap attrs;
+            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
+            attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
+            attrs["value"] = 0.0f;
+
+            auto zero_op = framework::OpRegistry::CreateOp(
+                "fill_constant", framework::VariableNameMap{},
+                {{"Out", {pg_names[param_id]}}}, attrs);
+            zero_op->Run(scope, place);
+          }
+
+          // The inside gradient is renamed for the duration of the sum and
+          // renamed back afterwards.
+          auto new_inside_name = cur_scope.Rename(inside_grad_name);
+          // sum gradient
+
+          auto sum_op = framework::OpRegistry::CreateOp(
+              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
+              {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+          sum_op->Run(cur_scope, place);
+
+          cur_scope.Rename(new_inside_name, inside_grad_name);
+        }
+      }
+      VLOG(5) << "Accumulate Parameter finished ";
+
+      // Copy input gradient from inside to outside
+      //   outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
+      LinkTensorWithCallback(
+          cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads),
+          [&](const framework::LoDTensor &inside,
+              framework::LoDTensor *outside) {
+            if (inside.memory_size() == 0) {  // IG is not created.
+              return;
+            }
+            if (step_id == 0) {  // alloc memory
+              outside->Resize(PrependDims(seq_len, inside.dims()));
+              outside->mutable_data(place, inside.type());
+            }
+
+            auto dst = outside->Slice(seq_offset, seq_offset + 1);
+            framework::Copy(inside, place, dev_ctx, &dst);
+          });
+      VLOG(5) << "Link outside gradient finished ";
+
+      if (step_id + 1 == seq_len) {  // at_end
+        // copy initialize states gradient from inside to outside
+        LinkTensorWithCallback(
+            cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)),
+            scope, Outputs(kInitStateGrads),
+            [&](const framework::LoDTensor &inside,
+                framework::LoDTensor *outside) {
+              outside->Resize(inside.dims());
+              outside->mutable_data(place, inside.type());
+              framework::Copy(inside, place, dev_ctx, outside);
+            });
+        VLOG(5) << "Link initialize state gradient finished ";
+      }
+      scopes.Next();
+    }
+  }
+
+ private:
+  // Builds the backward StepScopes over the variable named by the
+  // kStepScopes input.
+  StepScopes CreateStepScopes(const framework::Scope &scope,
+                              size_t seq_len) const {
+    auto *var = scope.FindVar(Input(kStepScopes));
+    PADDLE_ENFORCE(var != nullptr);
+    return StepScopes(scope, var->GetMutable<StepScopeVar>(),
+                      Attr<bool>(kIsTrain), seq_len, true /*is_backward*/);
+  }
+
+  std::unordered_set<std::string> List2Set(
+      const std::vector<std::string> &list) const {
+    std::unordered_set<std::string> local_var_name_set;
+    local_var_name_set.reserve(list.size());
+    for (auto &each : list) {
+      local_var_name_set.insert(each);
+    }
+    return local_var_name_set;
+  }
+
+  std::unordered_set<std::string> LocalVarNames(
+      const framework::Scope &scope) const {
+    return this->List2Set(scope.LocalVarNames());
+  }
+
+  // Maps every name in var_names through framework::GradVarName.
+  static std::vector<std::string> GradVarLists(
+      const std::vector<std::string> &var_names) {
+    std::vector<std::string> retv;
+    retv.reserve(var_names.size());
+    std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv),
+                   framework::GradVarName);
+    return retv;
+  }
+};
+
+// Declares the inputs, outputs, attributes and documentation of the
+// recurrent op.
+class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kInputs, "rnn inputs").AsDuplicable();
+    AddInput(kInitialStates, "rnn initial states").AsDuplicable();
+    AddInput(kParameters,
+             "Parameters are used by step block as its input. However, the "
+             "input is not a sequence tensor. Every time step, each operator "
+             "in step block just use the parameter directly.")
+        .AsDuplicable();
+    AddOutput(kOutputs,
+              "The output sequence of RNN. The sequence length must be same.")
+        .AsDuplicable();
+    AddOutput(kStepScopes,
+              "StepScopes contain all local variables in each time step.");
+    AddAttr<std::vector<std::string>>(kExStates,
+                                      string::Sprintf(
+                                          R"DOC(The ex-state variable names.
+The ex-state means the state value in the ex-timestep or the previous time step
+[%s, %s, %s] must be the same order)DOC",
+                                          kExStates, kStates, kInitStateGrads));
+    AddAttr<std::vector<std::string>>(
+        kStates,
+        string::Sprintf(
+            "The state variable names. [%s, %s, %s] must be the same order",
+            kExStates, kStates, kInitStateGrads));
+    AddAttr<framework::BlockDesc *>(kStepBlock, "The step block inside RNN");
+    AddAttr<bool>(kReverse, R"DOC(Calculate RNN reversely or not.
+By default reverse=False
+
+Assume the input data is [A, B, C, D]
+
+if reverse is False:
+  the computation of RNN is like
+      A          B          C         D
+      |          |          |         |
+      v          v          v         v
+     rnn -----> rnn -----> rnn ----> rnn
+      |          |          |         |
+      v          v          v         v
+      o          o          o         o
+
+if reverse is True
+  the computation of RNN is like
+      A          B          C         D
+      |          |          |         |
+      v          v          v         v
+     rnn <----- rnn <----- rnn <---- rnn
+      |          |          |         |
+      v          v          v         v
+      o          o          o         o
+)DOC").SetDefault(false);
+    // NOTE(review): this attribute is registered with an empty description.
+    AddAttr<bool>(kIsTrain, "").SetDefault(true);
+    AddComment(R"DOC(
+Static Length Recurrent Operator.
+
+The static length recurrent operator can only operate on fixed size sequence
+data, i.e. in each mini-batch, the sequence length of all inputs are the same.
+
+)DOC");
+  }
+};
+
+// Builds the description of the "recurrent_grad" op from the forward op.
+class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Owned by a unique_ptr from the start so the description is released if
+    // any of the setters below throws.
+    std::unique_ptr<framework::OpDesc> grad(new framework::OpDesc());
+    grad->SetType("recurrent_grad");
+    for (auto &input_param : this->InputNames()) {
+      grad->SetInput(input_param, this->Input(input_param));
+      grad->SetOutput(framework::GradVarName(input_param),
+                      this->InputGrad(input_param, false));
+    }
+
+    for (auto &output_param : this->OutputNames()) {
+      grad->SetInput(output_param, this->Output(output_param));
+      if (output_param == kStepScopes) {
+        // The step scopes have no gradient variable; their gradient slot is
+        // wired to the forward output itself.
+        grad->SetInput(framework::GradVarName(output_param),
+                       this->Output(output_param));
+      } else {
+        grad->SetInput(framework::GradVarName(output_param),
+                       this->OutputGrad(output_param));
+      }
+    }
+    grad->SetAttrMap(this->Attrs());
+    grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
+
+    return grad;
+  }
+};
+
+// Shape inference for recurrent_grad: the gradients of the inputs, initial
+// states and (when present) parameters take the dims of their forward
+// counterparts.
+class RecurrentGradOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    std::vector<std::string> input{kInputs, kInitialStates};
+    std::vector<std::string> output{kOutputs};
+    for (auto &s : input) {
+      PADDLE_ENFORCE(ctx->HasInputs(s));
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
+                     "Cannot find the gradient variable %s",
+                     framework::GradVarName(s));
+    }
+    for (auto &s : output) {
+      PADDLE_ENFORCE(ctx->HasInputs(s));
+    }
+    for (auto &s : input) {
+      ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s));
+    }
+    if (ctx->HasInputs(kParameters)) {
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
+      ctx->SetOutputsDim(framework::GradVarName(kParameters),
+                         ctx->GetInputsDim(kParameters));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(recurrent, paddle::operators::RecurrentOp,
+                  paddle::operators::RecurrentOpProtoMaker,
+                  paddle::operators::RecurrentGradOpDescMaker);
+REGISTER_OPERATOR(recurrent_grad, paddle::operators::RecurrentGradOp,
+                  paddle::operators::RecurrentGradOpShapeInference);
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c093f60ceed4171ee4ab7f0e5757af2ee5950270
--- /dev/null
+++ b/paddle/fluid/operators/recv_op.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include <future>
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+namespace paddle {
+namespace operators {
+
+// Fetches every "Out" variable from the endpoint listed at the same index of
+// the "epmap" attribute, then waits for all requests to complete.
+class RecvOp : public framework::OperatorBase {
+ public:
+  RecvOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    const auto& outs = Outputs("Out");
+    const auto& epmap = Attr<std::vector<std::string>>("epmap");
+    // epmap is indexed once per output below and defaults to an empty list,
+    // so reject a list that is too short instead of reading past its end.
+    PADDLE_ENFORCE(epmap.size() >= outs.size(),
+                   "The epmap attribute must provide one endpoint for every "
+                   "output variable.");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < outs.size(); i++) {
+      VLOG(3) << "getting " << outs[i];
+      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+    }
+    PADDLE_ENFORCE(client_.Wait());
+  }
+
+ private:
+  // mutable because Run() is const but issuing requests changes the client.
+  mutable detail::RPCClient client_;
+};
+
+// Declares the output, attribute and documentation of the recv op.
+class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
+    AddComment(R"DOC(
+Recv operator
+
+This operator can get variables from server side.
+)DOC");
+    // NOTE(review): the description advertises a default endpoint, but the
+    // registered default is an empty list - confirm which one is intended.
+    AddAttr<std::vector<std::string>>("epmap",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints in the order of input "
+                                      "variables for mapping")
+        .SetDefault({});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4d9d4cc07b1f76ed04e17bc1cc65293163fb6f2
--- /dev/null
+++ b/paddle/fluid/operators/reduce_op.cc
@@ -0,0 +1,214 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+// Forward reduce operator: validates the "dim" attribute against the rank of
+// X and infers the shape of Out.
+class ReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReduceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReduceOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    int dim = ctx->Attrs().Get<int>("dim");
+    // A negative dim counts from the back.
+    if (dim < 0) dim = x_rank + dim;
+    // dim < -rank is still negative here and would index before the start of
+    // the dims vector below, so check the lower bound as well.
+    PADDLE_ENFORCE(
+        dim >= 0,
+        "The dim should be in the range [-rank(input), rank(input)).");
+    PADDLE_ENFORCE_LT(
+        dim, x_rank,
+        "The dim should be in the range [-rank(input), rank(input)).");
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+    if (reduce_all) {
+      if (keep_dim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
+    } else {
+      auto dims_vector = vectorize(x_dims);
+      // A rank-1 input keeps one dimension of length 1 instead of becoming
+      // rank 0.
+      if (keep_dim || x_rank == 1) {
+        dims_vector[dim] = 1;
+      } else {
+        dims_vector.erase(dims_vector.begin() + dim);
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dim != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
+    }
+  }
+};
+
+// Gradient of the reduce operators: X@GRAD, when requested, takes the dims
+// and LoD of X.
+class ReduceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    int dim = ctx->Attrs().Get<int>("dim");
+    if (dim < 0) dim = x_rank + dim;
+    // Same range check as in ReduceOp::InferShape, including the lower bound.
+    PADDLE_ENFORCE(
+        dim >= 0,
+        "The dim should be in the range [-rank(input), rank(input)).");
+    PADDLE_ENFORCE_LT(
+        dim, x_rank,
+        "The dim should be in the range [-rank(input), rank(input)).");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
+  }
+};
+
+// Shared proto maker of the reduce operators. The comment is a template with
+// the placeholders {ReduceOp} and {reduce}; derived makers fill them in with
+// SetComment() and register the result with AddComment().
+class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<int>(
+        "dim",
+        "(int, default 0) The dimension to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `dim < 0`, the dim to reduce is `rank + dim`. "
+        "Note that reducing on the first dim will make the LoD info lost.")
+        .SetDefault(0);
+    AddAttr<bool>("keep_dim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
+    comment_ = R"DOC(
+{ReduceOp} Operator.
+
+This operator computes the {reduce} of input tensor along the given dimension. 
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
+
+)DOC";
+    AddComment(comment_);
+  }
+
+ protected:
+  std::string comment_;
+
+  // Replaces every occurrence of `from` in `src` with `to`, in place. The
+  // search resumes after each inserted `to`, so text introduced by a
+  // replacement is never rescanned.
+  void Replace(std::string &src, const std::string &from,
+               const std::string &to) {
+    // An empty pattern matches at every position and would never terminate.
+    if (from.empty()) return;
+    for (std::size_t pos = src.find(from); pos != std::string::npos;
+         pos = src.find(from, pos + to.size())) {
+      src.replace(pos, from.size(), to);
+    }
+  }
+
+  // Fills the placeholders of comment_: {ReduceOp} with `name` and {reduce}
+  // with `op`.
+  void SetComment(const std::string &name, const std::string &op) {
+    Replace(comment_, "{ReduceOp}", name);
+    Replace(comment_, "{reduce}", op);
+  }
+};
+
+class ReduceSumOpMaker : public ReduceOpMaker {
+ public:
+  ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceSum", "sum");
+    AddComment(comment_);
+  }
+};
+
+class ReduceMeanOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMean", "mean");
+    AddComment(comment_);
+  }
+};
+
+class ReduceMaxOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMax", "max");
+    AddComment(comment_);
+  }
+};
+
+class ReduceMinOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMin", "min");
+    AddComment(comment_);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
+            ops::ReduceGradOp);
+
+REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
+            reduce_mean_grad, ops::ReduceGradOp);
+
+REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
+            ops::ReduceGradOp);
+
+REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
+            ops::ReduceGradOp);
+
+// Registers the CPU forward and backward kernels of one reduce op for float,
+// double, int and int64_t. FOR_EACH_KERNEL_FUNCTOR invokes the macro it is
+// given; it is not defined in this file.
+#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
+  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           float, ops::functor>,               \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           double, ops::functor>,              \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           int, ops::functor>,                 \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           int64_t, ops::functor>);            \
+  REGISTER_OP_CPU_KERNEL(                                                      \
+      reduce_type##_grad,                                                      \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
+                            ops::grad_functor>,                                \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,        \
+                            ops::grad_functor>,                                \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,           \
+                            ops::grad_functor>,                                \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,       \
+                            ops::grad_functor>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1ca107ebfe9b617bd5e952965543549a8d92a5b1
--- /dev/null
+++ b/paddle/fluid/operators/reduce_op.cu
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/reduce_op.h" + +namespace ops = paddle::operators; + +#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type, ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel); \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a153cf272b5dd8abcba1bc7d3d02c480702eae4d --- /dev/null +++ b/paddle/fluid/operators/reduce_op.h @@ -0,0 +1,257 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "glog/logging.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +template +using EigenTensor = framework::EigenTensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +struct SumFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.sum(dim); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + dx.device(place) = dy.broadcast(dim); + } +}; + +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.mean(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + dx.device(place) = dy.broadcast(dim) / dx.constant(size); + } +}; + +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.minimum(dim); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + auto equals = x == y.broadcast(dim); + auto ones = dx.constant(1); + auto zeros = dx.constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. 
+ dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros); + } +}; + +template +class ReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + if (reduce_all) { + // Flatten and reduce 1-D tensor + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto x = EigenVector::Flatten(*input); + auto out = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + Functor functor; + functor(place, x, out, reduce_dim); + } else { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + ReduceCompute<1>(context); + break; + case 2: + ReduceCompute<2>(context); + break; + case 3: + ReduceCompute<3>(context); + break; + case 4: + ReduceCompute<4>(context); + break; + case 5: + ReduceCompute<5>(context); + break; + case 6: + ReduceCompute<6>(context); + break; + } + } + } + + private: + template + void ReduceCompute(const framework::ExecutionContext& context) const { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto x = EigenTensor::From(*input); + auto x_rank = static_cast(x.dimensions().size()); + int dim = static_cast(context.Attr("dim")); + if (dim < 0) dim = x_rank + dim; + auto reduce_dim = Eigen::array({{dim}}); + // construct the squeezed output tensor + bool keep_dim = context.Attr("keep_dim"); + DDim dims = output->dims(); + auto dims_vector = vectorize(dims); + if (keep_dim && x_rank > 1) { + dims_vector.erase(dims_vector.begin() + dim); + dims = framework::make_ddim(dims_vector); + } + + auto& place = + *context.template device_context().eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, x, out, reduce_dim); + } else { + auto out 
= EigenTensor::From(*output, dims); + functor(place, x, out, reduce_dim); + } + } +}; + +template +class ReduceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + if (reduce_all) { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + auto x = EigenVector::Flatten(*input0); + auto x_reduce = EigenVector::From(*input1); + auto x_reduce_grad = EigenVector::From(*input2); + auto x_grad = EigenVector::Flatten(*output); + auto& place = + *context.template device_context().eigen_device(); + auto broadcast_dim = + Eigen::array({{static_cast(input0->numel())}}); + Functor functor; + functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + broadcast_dim[0]); + } else { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + ReduceGradCompute<1>(context); + break; + case 2: + ReduceGradCompute<2>(context); + break; + case 3: + ReduceGradCompute<3>(context); + break; + case 4: + ReduceGradCompute<4>(context); + break; + case 5: + ReduceGradCompute<5>(context); + break; + case 6: + ReduceGradCompute<6>(context); + break; + } + } + } + + private: + template + void ReduceGradCompute(const framework::ExecutionContext& context) const { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + + output->mutable_data(context.GetPlace()); + auto x = EigenTensor::From(*input0); + auto x_grad = EigenTensor::From(*output); + auto x_rank = static_cast(x.dimensions().size()); + int dim = static_cast(context.Attr("dim")); + if (dim < 0) dim = x_rank + dim; + DDim dims = input0->dims(); + dims[dim] = 1; 
+ auto x_reduce = EigenTensor::From(*input1, dims); + auto x_reduce_grad = EigenTensor::From(*input2, dims); + + Eigen::array broadcast_dim; + for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + broadcast_dim[dim] = input0->dims()[dim]; + auto& place = + *context.template device_context().eigen_device(); + Functor functor; + functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + broadcast_dim[dim]); + } +}; + +} // namespace operators +} // namespace paddle + +#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ + __macro(reduce_sum, SumFunctor, SumGradFunctor); \ + __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ + __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ + __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..148a65bb4b7fe599f2fdb833c179665e58fe1c41 --- /dev/null +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -0,0 +1,270 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class ReorderLoDTensorByRankTableOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto, + OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input lod tensor to be reordered according to " + "Input(RankTable)."); + AddInput("RankTable", + "(LoDRankTable), the rank table according to which Input(X) is " + "reordered."); + AddOutput("Out", "(LoDTensor), the reordered lod tensor."); + AddComment(R"DOC(ReorderLoDTensorByRankTable operator. + +Input(X) is a batch of sequences. Input(RankTable) stores new orders of the +input sequence batch. The reorder_lod_tensor_by_rank operator reorders the +Input(X) according to the information provided by Input(RankTable). + +For example: + +If the indices stored in the Input(RankTable) are [3, 0, 2, 1], the +Input(X) will be reordered that the fourth sequence in Input(X) will become the +first one, and then followed by the original first, third, and the second one. + +This is: +X = [Seq0, Seq1, Seq2, Seq3]. The indices in RankTable are [3, 0, 2, 1]. +Out = [Seq3, Seq0, Seq2, Seq1] with a new LoD information. + +If the LoD information of Input(X) is empty, this means Input(X) is not sequence +data. This is also identical to a batch of sequences where each sequence has a +fixed length 1. In this case, the reorder_lod_tensor_by_rank operator reorders +each slice of Input(X) along the first axis according to Input(RankTable). + +This is: +X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The +indices in RankTable are [3, 0, 2, 1]. +Out = [Slice3, Slice0, Slice2, Slice1] with no LoD information is appended. 
+ +NOTE: This operator sorts Input(X) according to a given LoDRankTable which does +not need to be calculated according to Input(X). It can be calculated according +to another different sequence, and then this operator sorts Input(X) according +to the given LoDRankTable. + +)DOC"); + } +}; + +class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { + public: + ReorderLoDTensorByRankTableBase(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = + detail::Ref(scope.FindVar(Input("X")), + "Cannot find input lod tensor variable %s", Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")), + "Cannot find input rank table variable %s", + Input("RankTable")) + .Get(); + auto &out = + *detail::Ref(scope.FindVar(Output("Out")), + "Cannot find output lod tensor variable %s", Output("Out")) + .GetMutable(); + + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + this->process(place, x, rank_table, &out); + } + + protected: + virtual void process(const platform::Place &place, + const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const = 0; + + struct AbsoluteRankTableItem { + size_t offset; // the absolute/accumulated offset. 
+ size_t length; // the length + framework::LoD lod; + }; + + std::vector GetAbsoluteOffsetAndLengthByLoDRankTable( + const framework::LoDTensor &x) const { + std::vector absolute_table; + + if (x.lod().empty()) { + // For Tensor without lod, such as the output of sequence_pool_op + size_t size = x.dims()[0]; + absolute_table.reserve(size); + for (size_t i = 0; i < size; ++i) { + absolute_table.emplace_back(); + absolute_table.back().length = 1; + absolute_table.back().offset = i; + } + } else { + size_t level = 0; + size_t size = x.lod()[level].size(); + + for (size_t i = 0; i < size - 1; ++i) { + auto lod_offset = + framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); + + auto &offset = lod_offset.second; + + absolute_table.emplace_back(); + absolute_table.back().length = offset.second - offset.first; + absolute_table.back().offset = offset.first; + absolute_table.back().lod = lod_offset.first; + } + } + + return absolute_table; + } + + size_t CopyTensorAndLod(const platform::Place &place, + const AbsoluteRankTableItem &item, + const framework::LoDTensor &x, + framework::LoDTensor *out, size_t out_offset) const { + auto &out_lod = *out->mutable_lod(); + auto len = item.length; + auto x_offset = item.offset; + + if (out_lod.empty()) { + for (size_t i = 0; i < item.lod.size(); ++i) { + out_lod.push_back(std::vector({0})); + } + } + + for (size_t i = 0; i < out_lod.size(); ++i) { + auto &out_v = out_lod[i]; + auto &new_lod_v = item.lod[i]; + + for (auto &detail : new_lod_v) { + out_v.push_back(out_v.back() + detail); + } + } + + auto x_sliced = x.Slice(x_offset, x_offset + len); + auto out_sliced = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); + out_offset += len; + return out_offset; + } +}; + +class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { 
+ public: + ReorderLoDTensorByRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + size_t out_offset = 0; + out->mutable_lod()->clear(); + for (auto &item : rank_table.items()) { + PADDLE_ENFORCE_LT(item.index, absolute_table.size()); + out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out, + out_offset); + } + } +}; + +class IdentityInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ReorderLodTensorByRankGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("reorder_lod_tensor_by_rank_grad"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetInput("RankTable", Input("RankTable")); + return std::unique_ptr(grad_op); + } +}; + +class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor 
*out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + + // offsets = enumerate([item.index for item in rank_table.items()]) + std::vector> offsets; + offsets.reserve(rank_table.items().size()); + for (size_t i = 0; i < rank_table.items().size(); ++i) { + offsets.push_back({i, rank_table.items()[i].index}); + } + + // offsets.sort(key=lambda x: x[1]) + std::sort( + offsets.begin(), offsets.end(), + [](const std::pair &a, + const std::pair &b) { return a.second < b.second; }); + + // Copy TensorAndLod + size_t out_offset = 0; + for (auto &offset : offsets) { + out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first], + x, out, out_offset); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(reorder_lod_tensor_by_rank, + ops::ReorderLoDTensorByRankTableOp, + ops::ReorderLodTensorByRankGradOpMaker, + ops::ReorderLoDTensorByRankTableOpProtoMaker, + ops::IdentityInferShape); +REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad, + ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4f80cc06abaa536d1b1097850047fd370246dee --- /dev/null +++ b/paddle/fluid/operators/reshape_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/reshape_op.h" + +namespace paddle { +namespace operators { + +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + auto shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); + auto x_dims = ctx->GetInputDim("X"); + + std::vector neg_dims_idx; + // set some dimension to -1 if it is unknown + const int unknown_size = -1; + for (size_t i = 0; i < shape.size(); ++i) { + PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size, + "Each dimension of Attr(shape) must be positive or %d.", + unknown_size); + if (shape[i] == unknown_size) { + neg_dims_idx.push_back(i); + PADDLE_ENFORCE(neg_dims_idx.size() <= 1, + "Only one dimension of Attr(shape) can be unknown."); + } + } + + int64_t capacity = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + int64_t in_size = framework::product(x_dims); + if (neg_dims_idx.size() == 1) { + // dim infer + shape[neg_dims_idx[0]] = in_size / (-capacity); + // recalculate capacity + capacity = shape[neg_dims_idx[0]] * (-capacity); + } + // capacity check + PADDLE_ENFORCE(capacity == in_size, + "The size of Input(X) mismatches with Attr(shape)."); + // resize output + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + auto out_dims = framework::make_ddim(shape_int64); + ctx->SetOutputDim("Out", out_dims); + if (shape[0] == x_dims[0]) { + // 
Only pass LoD when the first dimension is equal between + // output and input. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } +}; + +class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of reshape operator."); + AddOutput("Out", "The output tensor of reshape operator."); + AddAttr>("shape", + "(vector) " + "Target shape of reshape operator."); + AddComment(R"DOC( +Reshape Operator. + +Reshape Input(X) into the shape specified by Attr(shape). + +An example: +Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]] + +and target shape = [1, 4], the reshape operator will transform +the tensor X into a 2-D tensor: [[1, 2, 3, 4]] + +One dimension in the target shape can be set -1, representing that its +size is unknown. In this case, the real dimension will be infered from +the original shape of Input(X) and other dimensions in the target shape. 
+)DOC"); + } +}; + +class ReshapeGradOp : public framework::OperatorWithKernel { + public: + ReshapeGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad, + ops::ReshapeGradOp); +REGISTER_OP_CPU_KERNEL(reshape, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL( + reshape_grad, ops::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f9ae6da29e54187b2d6aedb833a2aa4ca95cacba --- /dev/null +++ b/paddle/fluid/operators/reshape_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/reshape_op.h" + +REGISTER_OP_CUDA_KERNEL( + reshape, + paddle::operators::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL( + reshape_grad, + paddle::operators::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a17ba7c619490977b837c565ef1f4cc0780d5c61 --- /dev/null +++ b/paddle/fluid/operators/reshape_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ReshapeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); + auto out_dims = out->dims(); + out->mutable_data(ctx.GetPlace()); + framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out); + out->Resize(out_dims); + } +}; + +template +class ReshapeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + + auto in_dims = d_x->dims(); + framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); + d_x->Resize(in_dims); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..06d3ccafefd4cc163b806aeb5d2a582c686f10cb --- /dev/null +++ b/paddle/fluid/operators/rmsprop_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/rmsprop_op.h" + +namespace paddle { +namespace operators { + +class RmspropOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("MeanSquare"), + "Input(MeanSquare) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of RmspropOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(param_out) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(Momentum_out) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), + "Output(MeanSquareOut) of RmspropOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and grad input of RmspropOp should have the same dimension."); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"), + "Param and Momentum input of RmspropOp " + "should have the same dimension."); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"), + "Param and Momentum input of RmspropOp " + "should have the same dimension."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("MomentOut", param_dim); + ctx->SetOutputDim("MeanSquareOut", param_dim); + } +}; + +class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RmspropOpMaker(OpProto *proto, OpAttrChecker 
*op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("MeanSquare", + "(Tensor, default Tensor)" + " The mean square value that gets updated."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("Moment", + "(Tensor, default Tensor) The moment that gets updated."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("MomentOut", "(Tensor) Output updated moment."); + AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); + + AddAttr("epsilon", + "(float, default 1e-10) Constant " + "for numerical stability.") + .SetDefault(1.0e-10f); + AddAttr("decay", + "(float, default 0.9) " + "Discounting factor for coming gradient.") + .SetDefault(0.9f); + AddAttr("momentum", "(float, default 0.0) Constant value.") + .SetDefault(0.0f); + AddComment(R"DOC( +Rmsprop Optimizer. + +$$ +MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\ +MomentOut = momentum * Moment + + \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\ +ParamOut = Param - MomentOut +$$ + +The original slides that proposed Rmsprop: Slide 29 of +http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); +REGISTER_OP_CPU_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/fluid/operators/rmsprop_op.cu b/paddle/fluid/operators/rmsprop_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a909c942791d2e2e4d9887d4c9265383a93ca137 --- /dev/null +++ b/paddle/fluid/operators/rmsprop_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/rmsprop_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h new file mode 100644 index 0000000000000000000000000000000000000000..469c102a4721ca45026112e3166dc0807ba93292 --- /dev/null +++ b/paddle/fluid/operators/rmsprop_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class RmspropOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { // one RMSProp step: updates MeanSquareOut, MomentOut and ParamOut + auto* param_out = ctx.Output("ParamOut"); + auto* moment_out = ctx.Output("MomentOut"); + auto* mean_square_out = ctx.Output("MeanSquareOut"); + + auto grad = ctx.Input("Grad"); + + param_out->mutable_data(ctx.GetPlace()); + moment_out->mutable_data(ctx.GetPlace()); + mean_square_out->mutable_data(ctx.GetPlace()); + + float epsilon = ctx.Attr("epsilon"); + float rho = ctx.Attr("decay"); // the "decay" attribute is called rho below + float momentum = ctx.Attr("momentum"); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + auto g = EigenVector::Flatten(*grad); + auto mom = EigenVector::Flatten(*ctx.Input("Moment")); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = EigenVector::Flatten(*mean_square_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); // extent used to broadcast lr over every element of Grad + + ms_out.device(place) = rho * ms + (1 - rho) * g * g; // MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad^2 + mom_out.device(place) = + momentum * mom + + lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); // epsilon is added inside the square root + p_out.device(place) = p - mom_out; // ParamOut = Param - MomentOut + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..504456c4b069f81319893ae51f57503f5025761a --- /dev/null +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class RNNMemoryHelperOp : public framework::OperatorBase { + public: + RNNMemoryHelperOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto mem_var_name = Input("X"); + auto *mem_var = scope.FindVar(mem_var_name); + PADDLE_ENFORCE(mem_var != nullptr, + "Cannot find mem_var in scope, mem_var_name is %s", + mem_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto *out_tensor = out_var->GetMutable(); + auto &mem_tensor = mem_var->Get(); + out_tensor->ShareDataWith(mem_tensor); + out_tensor->set_lod(mem_tensor.lod()); + } +}; + +class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Out"), ""); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class RNNMemoryHelperOpInfoMaker : 
public framework::OpProtoAndCheckerMaker { + public: + RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddOutput("Out", ""); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddComment(""); + } +}; + +class RNNMemoryHelperGradOp : public framework::OperatorBase { + public: + RNNMemoryHelperGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto out_grad_var_name = Input(framework::GradVarName("Out")); + auto *out_grad_var = scope.FindVar(out_grad_var_name); + + auto in_grad_var_name = Output(framework::GradVarName("X")); + auto *in_grad_var = scope.FindVar(in_grad_var_name); + PADDLE_ENFORCE(in_grad_var != nullptr, + "Cannot find in_grad_var in scope, name is %s", + in_grad_var_name); + + if (out_grad_var == nullptr) { + VLOG(5) << "Using fill constant 0 as starting gradient"; + auto in_var_name = Input("X"); + auto *in_var = scope.FindVar(in_var_name); + auto &in_var_tensor = in_var->Get(); + + framework::AttributeMap attrs; + attrs["dtype"] = framework::ToDataType(in_var_tensor.type()); + attrs["shape"] = framework::vectorize2int(in_var_tensor.dims()); + attrs["value"] = 0.0f; + + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); + zero_op->Run(scope, dev_place); + } else { + auto &out_grad_tensor = out_grad_var->Get(); + auto *in_grad_tensor = in_grad_var->GetMutable(); + in_grad_tensor->ShareDataWith(out_grad_tensor); + in_grad_tensor->set_lod(out_grad_tensor.lod()); + } + } +}; + +class RNNMemoryHelperGradOpInfoMaker + : public framework::OpProtoAndCheckerMaker { + public: + 
RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(framework::GradVarName("Out"), ""); + AddInput("X", ""); + AddInput("Out", ""); + AddOutput(framework::GradVarName("X"), ""); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddComment(""); + } +}; + +class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + auto x_grad_name = framework::GradVarName("X"); + PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), ""); + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ x_grad_name); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(rnn_memory_helper, paddle::operators::RNNMemoryHelperOp, + paddle::operators::RNNMemoryHelperOpInfoMaker, + paddle::operators::RNNMemoryHelperOpShapeInference, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(rnn_memory_helper_grad, + paddle::operators::RNNMemoryHelperGradOp, + paddle::operators::RNNMemoryHelperGradOpInfoMaker, + paddle::operators::RNNMemoryHelperGradOpShapeInference); diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..09238f89a775979b8b1866d410e6ad1ef772f9d7 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/roi_pool_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kROISize = 5; + +class ROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Argmax"), + "Output(Argmax) of ROIPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW."); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); + PADDLE_ENFORCE(rois_dims[1] == kROISize, + "ROIs should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + 
"The spatial scale must greater than 0"); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputDim("Argmax", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor), " + "the input of ROIPoolOp. " + "The format of input tensor is NCHW. Where N is batch size, " + "C is the number of input channels, " + "H is the height of the feature, and " + "W is the width of the feature."); + AddInput("ROIs", + "(Tensor), " + "ROIs (Regions of Interest) to pool over. " + "should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]. 
" + "Where batch_id is the id of the data, " + "(x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates."); + AddOutput("Out", + "(Tensor), " + "The output of ROIPoolOp is a 4-D tensor with shape " + "(num_rois, channels, pooled_h, pooled_w)."); + AddOutput("Argmax", + "(Tensor), " + "Argmaxes corresponding to indices in X used " + "for gradient computation. Only output " + "if arg “is_test” is false.") + .AsIntermediate(); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "The pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "The pooled output width.") + .SetDefault(1); + AddComment(R"DOC( +ROIPool operator + +ROI Pooling for Faster-RCNN. The link below is a further introduction: +https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, + ops::ROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + roi_pool, + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + roi_pool_grad, + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e8fc9ec7a68cffeb45f8ece3e5bde39d1e71e92 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/roi_pool_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 5; + +static inline int NumBlocks(const int N) { // block count is capped, so kernels must loop to cover all N elements + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPUROIPoolForward(const int nthreads, const T* input_data, + const int64_t* input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, + const int pooled_width, T* output_data, + int64_t* argmax_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { // i (not index) is the output element handled by this iteration + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int roi_start_w = round(offset_input_rois[1] * spatial_scale); + int roi_start_h = round(offset_input_rois[2] * spatial_scale); + int roi_end_w = round(offset_input_rois[3] * spatial_scale); + int roi_end_h = round(offset_input_rois[4] * spatial_scale); + + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h =
static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[i] = maxval; + if (argmax_data) { + argmax_data[i] = maxidx; // stays -1 when no input element was selected + } + } +} + +template +__global__ void GPUROIPoolBackward( + const int nthreads, const int64_t* input_rois, const T* output_grad, + const int64_t* argmax_data, const int num_rois, const float spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { // i (not index) is the output-gradient element handled by this iteration + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int input_offset = (roi_batch_ind *
channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_argmax_data = argmax_data + output_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + platform::CudaAtomicAdd( + offset_input_grad + argmax, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } +} + +template +class GPUROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + auto in_stride = framework::stride(in_dims); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + size_t rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + GPUROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + channels, height, width, pooled_height, pooled_width, + out->mutable_data(ctx.GetPlace()), + argmax->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); + + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale =
ctx.Attr("spatial_scale"); + + size_t rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); // the backward kernel accumulates with atomic adds, so start from zero + + int output_grad_size = out_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), out_grad->data(), + argmax->data(), rois_num, spatial_scale, channels, height, + width, pooled_height, pooled_width, + x_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + roi_pool, + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + roi_pool_grad, + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolGradOpKernel); // every roi_pool_grad entry must be the grad kernel, not the forward one diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15f3b36fcd16bf72b9b09f58a3019b24538eec12 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPUROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { // max-pools each ROI of X into Out and records the winning input offsets in Argmax + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + const int64_t* rois_data = rois->data(); + T* output_data = out->mutable_data(ctx.GetPlace()); + int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); + + for (int n = 0; n < rois_num; ++n) { // first pass: check every ROI's batch id before any output is written + int roi_batch_id = rois_data[0]; + PADDLE_ENFORCE_GE(roi_batch_id, 0); + PADDLE_ENFORCE_LT(roi_batch_id, batch_size); + rois_data += roi_stride[0]; + } + + rois_data = rois->data(); // rewind to the first ROI for the pooling pass + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + int roi_start_w = round(rois_data[1] * spatial_scale); + int roi_start_h = round(rois_data[2] * spatial_scale); + int roi_end_w = round(rois_data[3] * spatial_scale); + int roi_end_h = round(rois_data[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); + int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); + + const float bin_size_h = + static_cast(roi_height) / static_cast(pooled_height); + const float bin_size_w = +
static_cast(roi_width) / static_cast(pooled_width); + + const T* batch_data = input_data + roi_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * roi_height / pooled_height) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + roi_start_h, 0), height); // shift into image coordinates and clamp to [0, height] + hend = std::min(std::max(hend + roi_start_h, 0), height); + wstart = std::min(std::max(wstart + roi_start_w, 0), width); + wend = std::min(std::max(wend + roi_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; // offset inside the current channel's pooled map + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = + is_empty ?
0 : -std::numeric_limits::max(); + argmax_data[pool_index] = -1; // -1 means no input element was selected + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + argmax_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; // advance input, output and argmax pointers to the next channel + output_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + // Increment ROI data pointer + rois_data += roi_stride[0]; + } + return; + } +}; + +template +class CPUROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { // adds each output gradient to the input position stored in Argmax + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + + if (in_grad) { + const int64_t* rois_data = rois->data(); + const T* out_grad_data = out_grad->data(); + const int64_t* argmax_data = argmax->data(); + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), in_grad, + static_cast(0)); // gradients are accumulated with += below, so start from zero + + auto in_stride = framework::stride(in->dims()); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out_grad->dims()); + + int rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = rois_data[0]; + T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + if (argmax_data[pool_index] >= 0) { // skip negative entries: they are not input offsets + auto index =
argmax_data[pool_index]; + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + rois_data += roi_stride[0]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..92661ea9716a89a66c27fa21543d81b5a280bcdd --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.cc @@ -0,0 +1,259 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/row_conv_op.h" +#include "paddle/fluid/framework/eigen.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +class RowConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { // X and Filter must be rank 2 with equal 2nd dim; Out takes X's dims and LoD + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of RowConvOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Filter)'s rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], filter_dims[1], + "The 2nd dimension of Input(X) and Input(Filter) should be same."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class RowConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { // each gradient output, when requested, takes the dims of its forward input + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto filter_grad_name = framework::GradVarName("Filter"); + if (ctx->HasOutput(filter_grad_name)) { + auto filter_dims = ctx->GetInputDim("Filter"); +
ctx->SetOutputDim(filter_grad_name, filter_dims); + } + } +}; + +class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input(X) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LoDTensor is a matrix with shape (T x N), where T " + "is the total time steps in this mini-batch and N is the input " + "data dimension."); + AddInput("Filter", + "(Tensor), the input(Filter) is a learnable parameter. It " + "is a 2-D tensor with shape (future_context x N), where, " + "future_context is the future context length and N is the data " + "dimension."); + AddOutput("Out", + "(LoDTensor), the output(Out) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LodTensor is a matrix with shape T x N, i.e., the " + "same shape as X."); + AddComment(R"DOC( +Row-convolution Operator. + +The row convolution is called lookahead convolution. This operator was +introduced in the following paper for DeepSpeech2: +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf + +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representation for a sequence by performing a +forward and a backward pass through the entire sequence. However, unlike +unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online +and low-latency setting. The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks.
The row convolution operator is +different from the 1D sequence convolution, and is computed as follows: + +Given an input sequence $in$ of length $t$ and input dimension $d$, +and a filter ($W$) of size $context \times d$, +the output sequence is convolved as: + +$$ +out_{i, :} = \sum_{j=i}^{i + context - 1} in_{j,:} \cdot W_{j-i, :} +$$ + +)DOC"); + } +}; + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { // per sequence: out[k] = sum over w of Filter[w] * in[k + w], truncated at the sequence end + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + + auto batch_indices = x->lod()[0]; + auto input_dim = x->dims()[1]; // 'in' is of size T x N + size_t num_sequence = batch_indices.size() - 1; + + auto future_context = filter->dims()[0]; + auto weights = EigenMatrix::From(*filter); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + Tensor cur_input_sequence = + x->Slice(start, end); // Current input sequence + Tensor cur_output_sequence = + out->Slice(start, end); // Current output sequence + auto cip_seq = EigenMatrix::From(cur_input_sequence); + auto cot_seq = EigenMatrix::From(cur_output_sequence); + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + for (int d = 0; d < input_dim; d++) { + if (w == 0) { + cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d); // first tap overwrites, so Out needs no prior zeroing + } else { + cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d); + } + } + } + } + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *d_out =
context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *d_filter = context.Output(framework::GradVarName("Filter")); + + auto input_dim = x->dims()[1]; // 'x' is of size T x N + auto batch_indices = x->lod()[0]; + size_t num_sequence = batch_indices.size() - 1; + auto future_context = filter->dims()[0]; + + if (d_filter) { + d_filter->mutable_data(context.GetPlace()); + auto dweights = + EigenMatrix::From(*d_filter); // Gradient of weight matrix + dweights.setZero(); + + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_input = x->Slice(start, end); // Current input sequence + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + + auto cur_ip = EigenMatrix::From(cur_input); + auto cur_dout = EigenMatrix::From(cur_doutput); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dweights (Updating the gradient of weight matrix) + for (int d = 0; d < input_dim; d++) { + dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d); + } + } + } + } + } + + if (dx) { + dx->mutable_data(context.GetPlace()); + auto weights = EigenMatrix::From(*filter); + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + Tensor cur_dinput = + dx->Slice(start, end); // Current input grad sequence + + auto cur_dout = EigenMatrix::From(cur_doutput); + auto cur_dip = EigenMatrix::From(cur_dinput); + cur_dip.setZero(); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different 
time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dinput (Updating the gradient wrt input) + for (int d = 0; d < input_dim; d++) { + cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d); + } + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, + ops::RowConvGradOp); +REGISTER_OP_CPU_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..832072edf810099d142c82930abfd7f198a7d1b8 --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.cu @@ -0,0 +1,410 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/row_conv_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Forward prop (shared memory version, for small future_context) +template +__global__ void RowConvForwardSharedMemory(const T *in, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *out) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (d < input_dim) + ? 
sw[w * blx + thx] * in[(start + k + w) * input_dim + d] + : static_cast(0); + } + if (d < input_dim) { + out[(start + k) * input_dim + d] = sum; + } + } + } +} + +// Forward prop (naive version) +template +__global__ void RowConvForward(const T *in, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *out) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]); + } + out[(start + k) * input_dim + d] = sum; + } + } +} + +// Compute input gradient (shared memory version, for small future_context) +template +__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *din) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (d < input_dim) + ? 
(sw[w * blx + thx] * dout[(k + start - w) * input_dim + d]) + : static_cast(0); + } + if (d < input_dim) { + din[(k + start) * input_dim + d] = sum; + } + } + } +} + +// Compute input gradient (Naive version) +template +__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *din) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]); + } + din[(k + start) * input_dim + d] = sum; + } + } +} + +// Compute W gradient (small future_context version) +template +__global__ void RowConvGradFilterImproved(const T *in, const T *dout, + int num_sequence, int input_dim, + int future_context, int block_x, + int block_y, + const size_t *batch_indices, + T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + + extern __shared__ T mem[]; + + int xdim_sh_in = block_y; + int xdim_sh_dout = block_y; + // int xdim_sh_dfilter = future_context; + int ydim_sh_in = block_x; + int ydim_sh_dout = block_x + future_context - 1; + int ydim_sh_dfilter = block_y; + + T *sh_in = mem; + T *sh_dout = &mem[xdim_sh_in * ydim_sh_in]; + T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout]; + + if (thy < future_context) { + sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = 
static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * ydim_sh_in + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0); + sh_dout[thx * ydim_sh_dout + thy + future_context - 1] = + (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0); + __syncthreads(); + + if (thy < future_context - 1) { + int pos_offset = pos - future_context + 1; + sh_dout[thx * ydim_sh_dout + thy] = + (d < input_dim && pos_offset >= start) + ? dout[pos_offset * input_dim + d] + : T(0); + } + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + T val = sh_in[thy * ydim_sh_in + thx] * + sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. 
+ val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0) { + sh_dfilter[w * ydim_sh_dfilter + thy] += val; + } + __syncthreads(); + } + } + } + for (int w = thy; (w < future_context) && (d < input_dim); w += bly) { + dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx]; + } +} + +// Compute weight(filter) gradient +template +__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, + int input_dim, int future_context, + int block_x, int block_y, + const size_t *batch_indices, T *dfilter) { + int blx = blockDim.x; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + extern __shared__ T mem[]; + T *sh_in = mem; + T *sh_dout = &mem[block_x * block_y]; + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * block_y + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0; + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + sh_dout[thx * block_y + thy] = + (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps) + ? dout[(pos - w) * input_dim + d] + : 0.0; + __syncthreads(); + + T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. 
+ val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0 && (gx + thy) < input_dim) { + dfilter[w * input_dim + gx + thy] += val; + } + } + } + } +} + +} // namespace + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *Out = context.Output("Out"); + + const T *in = X->data(); + const T *weight = Filter->data(); + T *out = Out->mutable_data(context.GetPlace()); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + auto stream = context.cuda_device_context().stream(); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvForwardSharedMemory< + T><<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvForward<<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *dOut = context.Input(framework::GradVarName("Out")); + const T *in = X->data(); + const T *weights = Filter->data(); + const T *dout = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context 
= Filter->dims()[0]; + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + + auto &device_ctx = context.cuda_device_context(); + math::SetConstant zero; + + if (dFilter) { + T *dfilter = dFilter->mutable_data(context.GetPlace()); + zero(device_ctx, dFilter, static_cast(0.0)); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_y * block_x + block_y * (block_x + future_context - 1) + + future_context * block_y) * + sizeof(T); + RowConvGradFilterImproved< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 + RowConvGradFilter< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } + } + + if (dX) { + T *din = dX->mutable_data(context.GetPlace()); + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvGradInputSharedMemory< + T><<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvGradInput<<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.h b/paddle/fluid/operators/row_conv_op.h new file mode 
100644 index 0000000000000000000000000000000000000000..59164b5215910630b4641501bc0b0c0e941911c2 --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c23de9073ef965b989e98936b2dd07fc6bce7fdc --- /dev/null +++ b/paddle/fluid/operators/save_combine_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +// TODO(sidgoyal78): These function are needed by other files (save_op), move +// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op). +constexpr char kSEP = '/'; +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + if (mkdir(path, 0755)) { + PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); + } +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} + +class SaveCombineOp : public framework::OperatorBase { + public: + SaveCombineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = 
Attr("file_path"); + auto overwrite = Attr("overwrite"); + + bool is_present = FileExists(filename); + if (is_present && !overwrite) { + PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto inp_var_names = Inputs("X"); + PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, + "The number of input variables should be greater than 0"); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < inp_var_names.size(); i++) { + auto *var = scope.FindVar(inp_var_names[i]); + + PADDLE_ENFORCE(var != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); + PADDLE_ENFORCE(var->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + auto &tensor = var->Get(); + // Serialize tensor + framework::SerializeToStream(fout, tensor, dev_ctx); + } + fout.close(); + } +}; + +class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(vector) Input LoDTensors that need to be saved together in a file.") + .AsDuplicable(); + AddComment(R"DOC( +SaveCombine operator + +This operator will serialize and write a list of input LoDTensor variables +to a file on disk. 
+)DOC"); + AddAttr("overwrite", + "(boolean, default true)" + "Overwrite the output file if it exists.") + .SetDefault(true); + AddAttr( + "file_path", + "(string)" + "The \"file_path\" where the LoDTensor variables will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(save_combine, ops::SaveCombineOp, + ops::SaveCombineOpProtoMaker); diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8325bac6bc59602e81d38cb857b7b8e133be2cc --- /dev/null +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" + +USE_NO_KERNEL_OP(save_combine); +USE_NO_KERNEL_OP(load_combine); + +int* CreateForSaveCombineOp(int x, int y, const std::vector& lod_info, + std::string var_name, + paddle::platform::CPUPlace& place, + paddle::framework::Scope& scope, + paddle::framework::LoD& expect_lod) { + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({x, y}); + expect_lod.resize(1); + for (size_t i = 0; i < lod_info.size(); i++) { + expect_lod[0].push_back(lod_info[i]); + } + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + return expect; +} + +paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad( + const std::string out_var_name, paddle::framework::Scope& scope) { + auto load_var = scope.Var(out_var_name); + auto target = load_var->GetMutable(); + return target; +} + +int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target, + paddle::framework::Scope& scope, + paddle::framework::LoD& actual_lod) { + int* actual = target->data(); + actual_lod = target->lod(); + return actual; +} + +void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod, + paddle::framework::LoD actual_lod, const int& numel) { + for (int64_t i = 0; i < numel; ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} + +// Here, we create 4 LoDTensors and use save_combine_op to first save these +// in a single file. 
Then, we use load_combine_op to load these sequentially +TEST(SaveLoadCombineOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + std::vector lod1 = {0, 1, 2, 3, 10}; + int numel1 = 100; + paddle::framework::LoD expect_lod1; + int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope, + expect_lod1); + + std::vector lod2 = {0, 2, 5, 10}; + int numel2 = 200; + paddle::framework::LoD expect_lod2; + int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope, + expect_lod2); + + std::vector lod3 = {0, 2, 3, 20}; + int numel3 = 4000; + paddle::framework::LoD expect_lod3; + int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place, + scope, expect_lod3); + + std::vector lod4 = {0, 1, 20}; + int numel4 = 1000; + paddle::framework::LoD expect_lod4; + int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope, + expect_lod4); + + // Set attributes + std::string filename = "check_tensor.ls"; + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string(filename)}); + + // Run the save_combine_op + auto save_combine_op = paddle::framework::OpRegistry::CreateOp( + "save_combine", + {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs); + save_combine_op->Run(scope, place); + + // Set up output vars + auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope); + auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope); + auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope); + auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope); + + // Run the load_combine_op + auto load_combine_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, + {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs); + load_combine_op->Run(scope, place); + + paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4; + int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1); 
+ int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2); + int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3); + int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4); + + CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1); + CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2); + CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3); + CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4); +} + +// Test with original SaveLoadTest +TEST(SaveLoadTestWithCombineOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("check_t.save")}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save_combine", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, place); + int* actual = target->data(); + for (int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + auto& actual_lod = target->lod(); + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} diff --git a/paddle/fluid/operators/save_load_op_test.cc 
b/paddle/fluid/operators/save_load_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..da4573a8ed936cf607123590ca41fb8f630930f3 --- /dev/null +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" + +USE_NO_KERNEL_OP(save); +USE_NO_KERNEL_OP(load); + +TEST(SaveLoadOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("tensor.save")}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, place); + int* actual = target->data(); + for 
(int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + auto& actual_lod = target->lod(); + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..483cdfa4c3b9e3b9abd3f32bc5e6e5e0b493bd23 --- /dev/null +++ b/paddle/fluid/operators/save_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +// TODO(yuyang18): If the functions below are needed by other files, move them +// to paddle::filesystem namespace. 
+constexpr char kSEP = '/'; +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + if (mkdir(path, 0755)) { + PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); + } +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} + +class SaveOp : public framework::OperatorBase { + public: + SaveOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + auto overwrite = Attr("overwrite"); + + if (FileExists(filename) && !overwrite) { + PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. 
+ std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto iname = Input("X"); + auto *var = scope.FindVar(iname); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", + iname); + + PADDLE_ENFORCE(var->IsType(), + "SaveOp only support LoDTensor, %s has wrong type", iname); + + auto &tensor = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::SerializeToStream(fout, tensor, dev_ctx); + } +}; + +class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor ) Input tensor to be saved"); + AddComment(R"DOC( +Save operator + +This operator will serialize and write a tensor variable to file on disk. +)DOC"); + AddAttr("overwrite", + "(boolean, default true)" + "Overwrite the output file if exist") + .SetDefault(true); + AddAttr("file_path", + "(string)" + "The \"file_path\" where the variable will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..017fc2c00e4016052179acfe328cdda42d6f84de --- /dev/null +++ b/paddle/fluid/operators/scale_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/scale_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +class ScaleOp : public framework::OperatorWithKernel { + public: + ScaleOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ScaleOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ScaleOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of scale operator."); + AddOutput("Out", "(Tensor) Output tensor of scale operator."); + AddComment(R"DOC( +Scale operator + +$$Out = scale*X$$ +)DOC"); + AddAttr("scale", + "(float, default 1.0)" + "The scaling factor of the scale operator.") + .SetDefault(1.0); + } +}; + +class ScaleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + 
grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", GetAttr("scale")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, + ops::ScaleGradMaker); +REGISTER_OP_CPU_KERNEL( + scale, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a9b46077aa07406fef2cba5b18d190501ce2f92a --- /dev/null +++ b/paddle/fluid/operators/scale_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/scale_op.h" + +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b1c2964ca6385dc6fb81f61a3e5bb042f5d7019f --- /dev/null +++ b/paddle/fluid/operators/scale_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class ScaleKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* tensor = context.Output("Out"); + auto* in = context.Input("X"); + tensor->mutable_data(in->place()); + + auto scale = static_cast(context.Attr("scale")); + + auto eigen_out = framework::EigenVector::Flatten(*tensor); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto& dev = + *context.template device_context().eigen_device(); + eigen_out.device(dev) = scale * eigen_in; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..0f1b9426a745ac293bd756da6ee750119879429e --- /dev/null +++ b/paddle/fluid/operators/scatter.cu.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ScatterCUDAKernel(const T* params, const int* indices, + T* output, size_t index_size, + size_t slice_size) { + CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + int scatter_i = indices[indices_i]; + int out_i = scatter_i * slice_size + slice_i; + *(output + out_i) = *(params + i); + } +} + +/** + * A thin wrapper on gpu tensor + * Return a new updated tensor from source tensor, scatter-assigned according to + * index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + // PADDLE_ENFORCE(platform::is_gpu_place(place)); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + int block = 512; + int n = slice_size * index_size; + int grid = (n + block - 1) / block; + + ScatterCUDAKernel<<< + grid, block, 0, + reinterpret_cast(ctx).stream()>>>( + p_src, p_index, p_output, index_size, slice_size); +} + +} // namespace operators +} // namespace paddle diff --git 
a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h new file mode 100644 index 0000000000000000000000000000000000000000..70cae1286caf10323e8e424853f1dc14f84b110c --- /dev/null +++ b/paddle/fluid/operators/scatter.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +/** + * Return a updated tensor from source tensor, scattered according to index: + * dst[i] = src[index[i]] + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + auto dst_dims = output->dims(); + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + // check src shape and dst shape should match + for (int i = 1; i < src_dims.size(); i++) + PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); + + // slice size + size_t slice_size 
= 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e35930af53463e18e5ecca3cf41b91ed58a7c4c2 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/scatter_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class ScatterOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ref"), + "Input(Ref) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Index"), + "Input(Index) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Updates"), + "Input(Updates) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ScatterOp should not be null."); + + auto updates_dims = ctx->GetInputDim("Updates"); + auto ref_dims = ctx->GetInputDim("Ref"); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1, + "Update Index should be 1-D."); + PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(), + "Reference and Updates should have the same shape size"); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], + ctx->GetInputDim("Index")[0], + "Updates and Index should have same batch-size."); + framework::DDim data_dim(updates_dims); + for (int i = 1; i < data_dim.size(); ++i) { + PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]); + } + ctx->SetOutputDim("Out", ref_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Ref")->type()), + ctx.device_context()); + } +}; + +class ScatterGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Updates"), + ctx->GetInputDim("Updates")); + ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref")); + } + + protected: + framework::OpKernelType 
GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Ref")->type()), + ctx.device_context()); + } +}; + +class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ref", "The source input of scatter op"); + AddInput("Index", + "The index input of scatter op where Ref will be updated"); + AddInput("Updates", "The updated value of updates op"); + AddOutput("Out", "The output of add op"); + AddComment(R"DOC( +Scatter Operator. + +This operator obtains output by updating the input on selected indices on the first axis: + +$$ +Out = Ref \\ +Out[Index] = Ref[Index] + Updates +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad, + ops::ScatterGradOp); +REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel); +REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel); diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f9eaae33a802ed1a45184a24757e3883fad5e639 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.cu @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gather.cu.h" +#include "paddle/fluid/operators/gather_op.h" +#include "scatter.cu.h" + +namespace paddle { +namespace operators { + +template +class ScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *Ref = ctx.Input("Ref"); + auto *Index = ctx.Input("Index"); + auto *Updates = ctx.Input("Updates"); + auto *Out = ctx.Output("Out"); + + Out->ShareDataWith(*Ref); + + GPUScatterAssign(ctx.device_context(), *Updates, *Index, Out); + } +}; + +template +class ScatterGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *dRef = ctx.Output(framework::GradVarName("Ref")); + auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto *Index = ctx.Input("Index"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + + // In place gradient: dRef = dO + dRef->ShareDataWith(*dOut); + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates = dO[Index] + GPUGather(ctx.device_context(), *dOut, *Index, dUpdates); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h new file mode 100644 index 0000000000000000000000000000000000000000..65d10546328780e09bb57876acf2326d98803847 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "gather.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ScatterOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + auto *Ref = ctx.Input("Ref"); + auto *Index = ctx.Input("Index"); + auto *Updates = ctx.Input("Updates"); + auto *Out = ctx.Output("Out"); + + // In place output: Out = Ref, Out[Index] += Updates + Out->ShareDataWith(*Ref); + // Apply ScatterUpdate: Out[index] += Updates[:] + ScatterAssign(ctx.device_context(), *Updates, *Index, Out); + } +}; + +template +class ScatterGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + auto *dRef = ctx.Output(framework::GradVarName("Ref")); + auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto *Index = ctx.Input("Index"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + + // In place gradient: dRef = dO + dRef->ShareDataWith(*dOut); + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates += dO[Index] + 
CPUGather(ctx.device_context(), *dOut, *Index, dUpdates); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8fb5ef96af34e5bd2dc0802ea76456a8b47749ab --- /dev/null +++ b/paddle/fluid/operators/scatter_test.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/scatter.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#include + +TEST(scatter, ScatterUpdate) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + + float* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({1, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({1}), CPUPlace()); + + for (size_t i = 0; i < 4; ++i) p_src[i] = float(i); + p_index[0] = 1; + + float* p_output = output->mutable_data(make_ddim({4, 4}), CPUPlace()); + + auto* cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext ctx(*cpu_place); + ScatterAssign(ctx, *src, *index, output); + + for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0)); + for 
(size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data()[i], float(0)); + for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4)); + for (size_t i = 4; i < 8; ++i) + EXPECT_EQ(output->data()[i], float(i - 4)); + for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0)); + for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data()[i], float(0)); + + delete src; + delete index; + delete output; +} diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8390aa6596c69f85e3ef736dda9dd99c3fd6dba --- /dev/null +++ b/paddle/fluid/operators/send_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#include +#include "paddle/fluid/operators/detail/grpc_client.h" + +namespace paddle { +namespace operators { + +class SendOp : public framework::OperatorBase { + public: + SendOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + auto ins = Inputs("X"); + auto outs = Outputs("Out"); + std::vector epmap = Attr>("epmap"); + std::vector endpoints = + Attr>("endpoints"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + auto client_var_name = Output("RPCClient"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), + "Can not find variable '%s' in the scope.", + client_var_name); + auto* client_var = scope.FindVar(client_var_name); + detail::RPCClient* rpc_client = client_var->GetMutable(); + + for (size_t i = 0; i < ins.size(); i++) { + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); + } + PADDLE_ENFORCE(rpc_client->Wait()); + + for (auto& ep : endpoints) { + VLOG(3) << "batch barrier, ep: " << ep; + rpc_client->AsyncSendBatchBarrier(ep); + } + PADDLE_ENFORCE(rpc_client->Wait()); + + if (outs.size() > 0) { + for (size_t i = 0; i < outs.size(); i++) { + VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; + rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + } + PADDLE_ENFORCE(rpc_client->Wait()); + } + } +}; + +class SendOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SendOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : 
OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); + AddOutput("Out", "(Tensor) Output tensor to be received from server") + .AsDuplicable(); + AddOutput("RPCClient", + "(RPCClient) The RPC client object which is" + "initialized at most once."); + AddComment(R"DOC( +Send operator + +This operator will send tensor to recv_op at the parameter server. +)DOC"); + // TODO(typhoonzero): remove this attr generate de-duplicated vector from + // epmap when initializing. + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to.") + .SetDefault({}); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker); diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..716f687044a85d46676141ee125baf398e9e695d --- /dev/null +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/string/printf.h" + +USE_NO_KERNEL_OP(send); +USE_NO_KERNEL_OP(listen_and_serv); +USE_OP(sum); + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +// global for simplicity. +std::unique_ptr listen_and_serv_op; + +void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { + p::CPUDeviceContext ctx(place); + for (int i = 0; i < 2; ++i) { + auto var_name = paddle::string::Sprintf("x%d", i); + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + float *expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + } + + auto out_var = scope.Var("Out"); + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({10, 10}); + out_tensor->mutable_data(place); // allocate +} + +void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { + p::CPUDeviceContext ctx(place); + int64_t height = 10; + int64_t row_numel = 10; + m::SetConstant set_one; + // init x0 + std::vector rows0{0, 4, 7}; + auto x0_var = scope.Var("x0"); + auto x0 = x0_var->GetMutable(); + x0->set_rows(rows0); + x0->set_height(height); + auto x0_value = x0->mutable_value(); + x0_value->mutable_data( + f::make_ddim({static_cast(rows0.size()), row_numel}), place); + set_one(ctx, x0_value, 1.0); + + // init x1 + std::vector rows1{2, 9}; + auto x1_var = scope.Var("x1"); + auto x1 = x1_var->GetMutable(); + x1->set_rows(rows1); + x1->set_height(height); + auto x1_value = x1->mutable_value(); + x1_value->mutable_data( + f::make_ddim({static_cast(rows1.size()), row_numel}), place); + set_one(ctx, 
x1_value, 1.0); + + auto out_var = scope.Var("Out"); + auto out = out_var->GetMutable(); + auto out_value = out->mutable_value(); + out->set_height(height); + out_value->mutable_data(f::make_ddim({5, 10}), place); +} + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + f::BlockDesc *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(f::proto::DataType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +void StartServerNet(bool is_sparse) { + f::Scope scope; + p::CPUPlace place; + if (is_sparse) { + InitSelectedRowsInScope(scope, place); + } else { + InitTensorsInScope(scope, place); + } + + // sub program run in listen_and_serv_op, for simple test we use sum + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + // X for server side tensors, RX for received tensers, must be of same shape. 
+ AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); + + f::AttributeMap attrs; + attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + attrs.insert({"ParamList", std::vector({"Out"})}); + attrs.insert({"GradList", std::vector({"x1"})}); + attrs.insert({"OptimizeBlock", block}); + listen_and_serv_op = + f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs); + listen_and_serv_op->Run(scope, place); +} + +TEST(SendRecvOp, CPUDense) { + std::thread server_thread(StartServerNet, false); + sleep(10); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + InitTensorsInScope(scope, place); + + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto in_var = scope.Var("x1"); + auto tensor = in_var->GetMutable(); + float *expected = tensor->data(); + auto out_var = scope.Var("Out"); + auto target = out_var->GetMutable(); + // x1 * 2 == x0 + EXPECT_NE(target->memory_size(), size_t(0)); + float *actual = target->data(); + for (int64_t i = 0; i < target->numel(); ++i) { + EXPECT_EQ(expected[i] * 2, actual[i]); + } + listen_and_serv_op->Stop(); + server_thread.join(); + listen_and_serv_op.reset(nullptr); +} + +TEST(SendRecvOp, CPUSparse) { + std::thread server_thread(StartServerNet, true); + sleep(3); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + p::CPUDeviceContext ctx(place); + InitSelectedRowsInScope(scope, place); + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto x0 = scope.Var("x0")->GetMutable(); + auto x1 = 
scope.Var("x1")->GetMutable(); + auto out = scope.Var("Out")->GetMutable(); + auto actual = out->mutable_value(); + + std::unique_ptr expect{new f::SelectedRows()}; + auto expect_value = expect->mutable_value(); + expect_value->mutable_data(f::make_ddim({5, 10}), place); + + m::SelectedRowsAdd add_functor; + add_functor(ctx, *x0, *x1, expect.get()); + + EXPECT_EQ(actual->numel(), expect_value->numel()); + EXPECT_EQ(out->rows().size(), x0->rows().size() + x1->rows().size()); + + for (int64_t i = 0; i < expect_value->numel(); ++i) { + EXPECT_EQ(expect_value->mutable_data(place)[i], + actual->mutable_data(place)[i]); + } + listen_and_serv_op->Stop(); + server_thread.join(); + listen_and_serv_op.reset(); +} diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ddf800d85e11aa631255c1b1ec5c12f6e0f221c --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_concat_op.h" + +namespace paddle { +namespace operators { + +class SequenceConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs("X"), + "Inputs(X) of SequenceConcatOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceConcatOp should not be null."); + const size_t level = static_cast(ctx->Attrs().Get("level")); + const size_t axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE(level == 0UL || level == 1UL, + "The sequence_concat operator only accepts sequence " + "or a nested sequence as its input."); + auto ins_dims = ctx->GetInputsDim("X"); + framework::DDim out_dims = ins_dims[0]; + const size_t n = ins_dims.size(); + for (size_t i = 1; i < n; ++i) { + out_dims[axis] += ins_dims[i][axis]; + } + ctx->SetOutputDim("Out", out_dims); + } +}; + +class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LodTensorArray) Input is a vector of LoDTensor, " + "each of which is a variable-length sequence or nested sequence.") + .AsDuplicable(); + AddOutput("Out", + "(LoDTensor), Variable-length output of " + "sequence_concat Op."); + AddAttr("axis", + "(int, default 0) " + "The axis along which the inputs will be joined. " + "If axis is 0, the inputs will be joined with LoD index.") + .SetDefault(0); + AddAttr("level", + "(int, default 0) " + "The level at which the inputs will be joined. " + "If the level is 0, the inputs will be joined at the nested " + "sequence level. " + "If the level is 1, the inputs will be joined at the " + "sequence level. 
" + "The level should be less than the level number of inputs.") + .SetDefault(0); + AddComment(R"DOC( +The sequence_concat operator concatenates multiple LoDTensors. +It only supports sequence (LoD Tensor with level number is 1) +or a nested sequence (LoD tensor with level number is 2) as its input. +- Case1: + If the axis is other than 0(here, axis is 1 and level is 1), + each input should have the same LoD information and the LoD + information of the output keeps the same as the input. + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4) + LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) + +- Case2: + If the axis is 0(here, leve is 0), the inputs are concatenated along + time steps, the LoD information of the output need to re-compute. + The LoD information of level-1 should be same. + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4) + +- Case3: + If the axis is 0(here, level is 1). + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4) + +- Case4: + If the LoD number is 1, axis is 0, level is 0 + + LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4) + +NOTE: The levels of all the inputs should be the same. 
+ )DOC"); + } +}; + +class SequenceConcatGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp, + ops::SequenceConcatOpMaker, sequence_concat_grad, + ops::SequenceConcatGradOp, false); +REGISTER_OP_CPU_KERNEL( + sequence_concat, + ops::SequenceConcatOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_concat_grad, + ops::SequenceConcatGradOpKernel); diff --git a/paddle/fluid/operators/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5a280ef9e2515114f5dd6826e55a304066973aa --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_concat_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_concat, + ops::SequenceConcatOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, + ops::SequenceConcatGradOpKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9121196369f1bee20abc56a33b9da8bc4a43f315 --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.h @@ -0,0 +1,172 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +LoD ConcatLoD(const std::vector ins, const size_t level) { + auto out_lod = ins[0]->lod(); + auto numLevels = ins[0]->NumLevels(); + const size_t n = ins.size(); + const size_t level_idx = ins[0]->NumLevels() - 1 - level; + for (size_t i = 1; i < n; ++i) { + for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) { + out_lod[level_idx][j] += ins[i]->lod()[level_idx][j]; + } + } + + for (size_t i = level_idx; i < numLevels - 1; ++i) { + size_t lod_len = 1; + for (size_t j = 0; j < n; ++j) { + lod_len += ins[j]->lod()[i + 1].size() - 1; + } + out_lod[i + 1].clear(); + out_lod[i + 1].resize(lod_len); + + size_t idx = 1; + for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) { + for (size_t k = 0; k < n; ++k) { + for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) { + out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] + + ins[k]->lod()[i + 1][m + 1] - + ins[k]->lod()[i + 1][m]; + idx++; + } + } + } + } + + return out_lod; +} + +template +class SequenceConcatOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + const size_t axis = static_cast(ctx.Attr("axis")); + const size_t level = static_cast(ctx.Attr("level")); + const size_t n = ins.size(); + + for (size_t i = 1; i < n; ++i) { + PADDLE_ENFORCE_EQ(ins[0]->NumLevels(), ins[i]->NumLevels(), + "The levels of all the input LoDTensors " + "should be the same."); + PADDLE_ENFORCE_EQ(ins[0]->dims().size(), ins[i]->dims().size(), + "The dimension size of all the input LoDTensors " + "should be the same."); + + const size_t dims_size = ins[i]->dims().size(); + for (size_t j = 0; j < 
dims_size; ++j) { + if (j == axis) continue; + PADDLE_ENFORCE_EQ(ins[0]->dims()[j], ins[i]->dims()[j], + "Except for the dimension of the specified " + "axis along which all the inputs are concatenated, " + "dimensions of all the other axises of the input " + "LoDTensors should be the same."); + } + } + PADDLE_ENFORCE_GT(ins[0]->NumLevels(), level, + "The levels of all the input LoDTensors " + "should be greater than the specify level"); + + out->mutable_data(ctx.GetPlace()); + auto out_lod = ins[0]->lod(); + if (axis == 0) { + out_lod = ConcatLoD(ins, level); + } + out->set_lod(out_lod); + + const size_t level_idx = out_lod.size() - level - 1; + auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; + for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { + Tensor out_t = out->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); + auto out_stride = framework::stride(out_t.dims()); + size_t offset = 0; + for (size_t j = 0; j < n; ++j) { + auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx]; + auto in_stride = framework::stride(ins[j]->dims()); + Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), + static_cast(in_lod_level[i + 1])); + size_t axis_dim = in_t.dims()[axis]; + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out_t.data() + offset); + offset += axis_dim * in_stride[axis]; + } + } + } +}; + +template +class SequenceConcatGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto x_grads = + ctx.MultiOutput(framework::GradVarName("X")); + size_t axis = static_cast(ctx.Attr("axis")); + size_t level = static_cast(ctx.Attr("level")); + const size_t n = x_grads.size(); + + // Set Grad(X) LoD as X + for (size_t i = 0; i < n; i++) { + x_grads[i]->set_lod(ins[i]->lod()); + 
x_grads[i]->mutable_data(ctx.GetPlace()); + } + auto out_lod = ins[0]->lod(); + if (axis == 0UL) { + out_lod = ConcatLoD(ins, level); + } + const size_t level_idx = out_lod.size() - level - 1; + auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; + + for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { + Tensor out_grad_t = + out_grad->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); + auto out_grad_stride = framework::stride(out_grad_t.dims()); + size_t offset = 0; + + for (size_t j = 0; j < n; ++j) { + auto x_grad_lod_level = + framework::ToAbsOffset(x_grads[j]->lod())[level_idx]; + auto x_grad_stride = framework::stride(x_grads[j]->dims()); + Tensor x_grad_t = + x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), + static_cast(x_grad_lod_level[i + 1])); + size_t axis_dim = x_grad_t.dims()[axis]; + StridedMemcpy(ctx.device_context(), out_grad_t.data() + offset, + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); + offset += axis_dim * out_grad_stride[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..af9938b18069d65648fbbf0deae31eff088b791f --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_conv_op.h" + +namespace paddle { +namespace operators { + +class SequenceConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of SequenceConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceConvOp should not be null."); + + int context_length = ctx->Attrs().Get("contextLength"); + int context_start = ctx->Attrs().Get("contextStart"); + + auto in_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE(ctx->Attrs().Get("contextStride") == 1, + "Currently, SequenceConvOp only supports contextStride=1."); + PADDLE_ENFORCE(in_dims.size() == 2 && filter_dims.size() == 2, + "Input(X, Filter) should be 2-D tensor."); + PADDLE_ENFORCE(filter_dims[0] == context_length * in_dims[1], + "Filter's height should be context_length * " + "input_hidden_size ."); + + if (ctx->Attrs().Get("paddingTrainable")) { + PADDLE_ENFORCE( + ctx->HasInput("PaddingData"), + "Input(PaddingData) of SequenceConvOp should not be null."); + framework::DDim padding_dim = ctx->GetInputDim("PaddingData"); + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int total_pad = up_pad + down_pad; + int input_width = static_cast(in_dims[1]); + + if (context_start == 0 && context_length == 1) { + PADDLE_THROW( + "If context_start is 0 and context_length is 1, paddingTrainable " + "should be false."); + } + PADDLE_ENFORCE(padding_dim.size() == 2, + "Input(PaddingData) should be 2-D tensor."); + PADDLE_ENFORCE( + padding_dim[0] == total_pad && padding_dim[1] == input_width, + "Input(PaddingData)'s shape is not consistent with 
'context_start' " + "and 'context_length'."); + } + + in_dims[1] = filter_dims[1]; + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input(X) should not be null."); + + if (ctx->Attrs().Get("paddingTrainable") && + ctx->HasOutput(framework::GradVarName("PaddingData"))) { + ctx->SetOutputDim(framework::GradVarName("PaddingData"), + ctx->GetInputDim("PaddingData")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), + ctx->GetInputDim("Filter")); + } + } +}; + +class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(LoDTensor) the input(X) is a LodTensor, which supports " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T, N), where T is the " + "total time steps in this mini-batch and N is the input_hidden_size."); + AddInput("PaddingData", + "(Tensor, optional) the input(PaddingData) is an optional " + "parameter, and it is learnable. " + "This is a tensor with shape (P, N), where P is the " + "top_pad + bottom_pad, N is the input_hidden_size. 
In order to " + "ensure the equal length of sequence before and after " + "convolution, it is necessary to fill the top and bottom of each " + "sequence according to context_length, context_stride and " + "context_start") + .AsDispensable(); + AddInput( + "Filter", + "(Tensor) the input(Filter) is an learnable parameter." + "This is a tensor with shape (K, M), where K is the " + "context_length * input_hidden_size, M is the output feature size."); + AddOutput( + "Out", + "(LoDTensor) the output(Out) is a LodTensor, which support " + "variable-time length output sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T, M), where, T is the " + "total time steps in this mini-batch, M is the output feature size."); + + AddAttr("paddingTrainable", + "(bool, default:false) the padding data of SequenceConvOp " + "is trainable or not.") + .SetDefault(false); + AddAttr("contextLength", + "(int) the contextLength of SequenceConvOp is the " + "height of the convolution kernel.") + .GreaterThan(0); + AddAttr("contextStart", + "(int, default:0) the contextStart of SequenceConvOp " + "represents the beginning of the convolution of the number of " + "rows of sequence, which can be negative. The negative number " + "means to pad contextStart time-steps of zeros or learnable " + "parameters at the beginning of each instance. The positive " + "number means to skip contextStart time-steps of each " + "instance.") + .SetDefault(0); + AddAttr("contextStride", + "(int, default:1) the contextStride of SequenceConvOp " + "represents the stride length of convolution kernel. " + "Currently, SequenceConvOp only supports" + "contextStride=1.") + .SetDefault(1) + .GreaterThan(0); + + AddComment(R"DOC( +Sequence Conv Operator. + +SequenceConvOp performs convolution operation on features of contextLength +time-steps of each instance. The convolution operation calculates the output +based on the input, filter, strides and paddings parameters. 
+The size of each dimension of the parameters is checked during infer-shape. +In order to ensure the equal length of sequence before and after convolution, +it is necessary to fill the top and bottom of each sequence based on +context_length, context_stride and context_start. + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, + sequence_conv_grad, ops::SequenceConvGradOp); + +REGISTER_OP_CPU_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CPU_KERNEL( + sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_conv_op.cu.cc b/paddle/fluid/operators/sequence_conv_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..36f9e8da95d8c963c74fb6c8e75c777b7ba03095 --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_conv_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_conv_op.h b/paddle/fluid/operators/sequence_conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c81067fea2370458cf6abe8e5465b4c674fbf09 --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/context_project.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + auto filter = *context.Input("Filter"); + + out->mutable_data(context.GetPlace()); + context.ShareLoD("X", "Out"); + + int context_start = context.Attr("contextStart"); + int context_length = context.Attr("contextLength"); + int context_stride = context.Attr("contextStride"); + bool padding_trainable = context.Attr("paddingTrainable"); + + PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, + "Only support one level sequence now."); + + const Tensor* padding_data = nullptr; + if (padding_trainable) { + padding_data = context.Input("PaddingData"); + } + + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int sequence_width = static_cast(in->dims()[1]); + + framework::DDim col_shape = {in->dims()[0], + context_length * sequence_width}; + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // Because if padding_trainable is false, padding data should be zeros. 
+ math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, &col, static_cast(0)); + + math::ContextProjectFunctor seq_project_functor; + + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); + + math::matmul(dev_ctx, col, false, filter, false, + static_cast(1.0), out, + static_cast(0.0)); + } +}; + +template +class SequenceConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in_g = context.Output(framework::GradVarName("X")); + auto* out_g = context.Input(framework::GradVarName("Out")); + auto* filter_g = context.Output(framework::GradVarName("Filter")); + auto* padding_data_g = + context.Output(framework::GradVarName("PaddingData")); + auto* in = context.Input("X"); + auto* filter = context.Input("Filter"); + + int context_start = context.Attr("contextStart"); + int context_length = context.Attr("contextLength"); + int context_stride = context.Attr("contextStride"); + bool padding_trainable = context.Attr("paddingTrainable"); + + PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, + "Only support one level sequence now."); + auto lod_g_level_0 = in->lod()[0]; + + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int sequence_width = static_cast(in->dims()[1]); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + // use col_shape in the im2col calculation + framework::DDim col_shape = {in->dims()[0], + sequence_width * context_length}; + Tensor col; + + if (in_g || filter_g || (padding_trainable && padding_data_g)) { + col.mutable_data(col_shape, context.GetPlace()); + // Because if padding_trainable is false, padding data should be zeros. 
+ set_zero(dev_ctx, &col, static_cast(0)); + math::matmul(dev_ctx, *out_g, false, *filter, true, + T(1.0), &col, T(1.0)); + } + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; + + if (in_g) { + in_g->mutable_data(context.GetPlace()); + in_g->set_lod(in->lod()); + set_zero(dev_ctx, in_g, static_cast(0)); + + seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + false, true, padding_data_g, &col); + } + + if (padding_trainable && padding_data_g) { + padding_data_g->mutable_data(context.GetPlace()); + set_zero(dev_ctx, padding_data_g, static_cast(0)); + + LoDTensor* input = const_cast(in); + seq_project_grad_functor( + dev_ctx, *input, padding_trainable, context_start, context_length, + context_stride, up_pad, down_pad, true, false, padding_data_g, &col); + } + + if (filter_g) { + filter_g->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_g, static_cast(0)); + + Tensor filter_grad = *filter_g; + LoDTensor out_grad = *out_g; + + const Tensor* padding_data = nullptr; + if (padding_trainable) { + padding_data = context.Input("PaddingData"); + } + + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); + + math::matmul(dev_ctx, col, true, out_grad, false, + T(1.0), &filter_grad, T(1.0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e0adf8b1900f7b7c43001459a7e7c494d854274 --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_erase_op.h" + +namespace paddle { +namespace operators { + +class SequenceEraseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceEraseOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceEraseOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1, + "Input(X) of SequenceEraseOp should be a 2-D LoDTensor " + "with the 2nd dimension equal to 1."); + ctx->SetOutputDim("Out", x_dims); + } +}; + +class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(2-D LoDTensor with the 2nd dim. equal to 1) " + "Input LoDTensor of SequenceEraseOp."); + AddOutput("Out", + "(2-D LoDTensor with the 2nd dim. equal to 1) " + "Output LoDTensor of SequenceEraseOp."); + AddAttr>("tokens", + "(vector) Tokens need to be erased from " + "input sequences."); + AddComment(R"DOC( +Sequence Erase Operator. + +Sequence erase operator erases tokens specified by Attr(tokens) from the input +sequences Input(X), and outputs the remaining data and modifies the LoD +information at the same time. 
For example, given a 2-D LoDTensor + + X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T + +with lod = [[0, 3, 6, 10]], there are three sequences in the input: + + X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T. + +If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing +operation, the three sequences become + + X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T. + +Hence the LoDTensor Output(Out) should be + + Out = [[6, 1, 9, 6, 1, 0, 1]]^T, + +with lod = [[0, 1, 3, 7]]. + +An example usage for this operator is to remove the special tokens when +computing the edit distance between two strings, such as blank, start token, +and end token. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp, + ops::SequenceEraseOpMaker); +REGISTER_OP_CPU_KERNEL( + sequence_erase, + ops::SequenceEraseKernel, + ops::SequenceEraseKernel); diff --git a/paddle/fluid/operators/sequence_erase_op.cu b/paddle/fluid/operators/sequence_erase_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..43fc352fe78d03fb54dd90a43e3d37b0646cefce --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.cu @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/operators/sequence_erase_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; +using LoDTensor = framework::LoDTensor; + +template +__global__ void LabelErasedIdx(const T* in_dat, const int64_t in_len, + const int* tokens, const size_t tokens_len, + size_t* num_erased) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + for (size_t i = 0; i < tokens_len; ++i) { + if (in_dat[index] == tokens[i]) { + num_erased[index + 1] = 1; + break; + } + } + } +} + +__global__ void GetOutLod(const size_t* num_erased, const size_t* in_lod, + const size_t lod_len, size_t* out_lod0) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < lod_len) { + out_lod0[index] = in_lod[index] - num_erased[in_lod[index]]; + } +} + +template +__global__ void SetOutput(const T* in_dat, const int64_t in_len, + const size_t* num_erased, T* out_dat) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + if (num_erased[index] == num_erased[index + 1]) { + out_dat[index - num_erased[index]] = in_dat[index]; + } + } +} + +template +class SequenceEraseOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto in_len = in->numel(); + auto in_dat = in->data(); + // Copy tokens to GPU + thrust::device_vector dev_tokens(tokens.begin(), tokens.end()); + int* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data()); + + // Count number of elements to be erased + thrust::device_vector num_erased(in_len + 1, 0); + size_t* 
num_erased_ptr = thrust::raw_pointer_cast(num_erased.data()); + auto stream = ctx.cuda_device_context().stream(); + LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_dat, in_len, dev_tokens_ptr, tokens.size(), num_erased_ptr); + thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(), + num_erased.begin() + 1); + + // Copy LoD to GPU + auto lod0 = lod[0]; + auto lod_len = lod0.size(); + const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace()); + + // Calc output LoD + thrust::device_vector dev_out_lod(lod_len); + size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); + GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); + // Set LoD for output + std::vector out_lod0(dev_out_lod.begin(), dev_out_lod.end()); + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + + // Set output + out->Resize({static_cast(out_lod0.back()), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len, + num_erased_ptr, out_dat); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(sequence_erase, + paddle::operators::SequenceEraseOpCUDAKernel, + paddle::operators::SequenceEraseOpCUDAKernel); diff --git a/paddle/fluid/operators/sequence_erase_op.h b/paddle/fluid/operators/sequence_erase_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e151279c7fc20d5e04048080a9432cb723334b75 --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class SequenceEraseKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto in_len = in->numel(); + auto in_dat = in->data(); + auto lod0 = lod[0]; + + std::vector num_erased(in_len + 1, 0); + std::vector out_lod0(1, 0); + for (size_t i = 0; i < lod0.size() - 1; ++i) { + size_t num_out = 0; + for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) { + num_erased[j] = num_erased[j - 1]; + if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) != + tokens.end()) { + num_erased[j] += 1; + } else { + num_out += 1; + } + } + out_lod0.push_back(out_lod0.back() + num_out); + } + + auto out_len = in_len - num_erased[in_len]; + out->Resize({static_cast(out_len), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + + for (int64_t i = 0; i < in_len; ++i) { + if (num_erased[i] == num_erased[i + 1]) { + out_dat[i - num_erased[i]] = in_dat[i]; + } + } + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_expand_op.cc 
b/paddle/fluid/operators/sequence_expand_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ebce641d2876a4f2329b6d7d7263a6b2a31fcf6 --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_expand_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SequenceExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasOutput("Out")); + PADDLE_ENFORCE(ctx->HasInput("Y")); + framework::DDim out_dim; + out_dim = ctx->GetInputDim("Y"); + ctx->ShareLoD("Y", "Out"); + ctx->SetOutputDim("Out", out_dim); + } +}; + +class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor or LoDTensor) The input(X) of this operator can be a " + "LoDTensor or a base Tensor."); + AddInput("Y", + "(LoDTensor)The reference input(Y) of sequence_expand op." + "It must be a LoDTensor with k-level(k>0)." + "The input(X) will be expanded according to LOD of input(Y)." 
+ "The element numbers of last level in input(Y) " + "must be equal to dims[0] of input(X)."); + AddOutput("Out", + "(LodTensor)The output of sequence_expand op." + "The lod of output will be as same as input(Y)'s lod."); + AddComment(R"DOC( +Sequence Expand Operator. + +This operator expands input(X) according to LOD of input(Y). +Following are cases to better explain how this works: +Case 1: + +Given a 2-level LoDTensor input(X) + X.lod = [[0, 2, 3], + [0, 1, 3, 4]] + X.data = [a, b, c, d] + X.dims = [4, 1] +and input(Y) + Y.lod = [[0, 2, 4], + [0, 3, 6, 7, 8]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 2-level LoDTensor + Out.lod = [[0, 2, 4], + [0, 3, 6, 7, 8]] + Out.data = [a, a, a, b, b, b, c, d] + Out.dims = [8, 1] + +Case 2: + +Given a common Tensor input(X) + X.data = [a, b, c] + X.dims = [3, 1] +and input(Y) + Y.lod = [[0, 2, 3, 6]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 1-level LoDTensor + Out.lod = [[0, 2, 3, 6]] + Out.data = [a, a, b, c, c, c] + Out.dims = [6, 1] + +Case 3: + +Given a common Tensor input(X) + X.data = [[a, b], [c, d], [e, f]] + X.dims = [3, 2] +and input(Y) + Y.lod = [[0, 2, 3, 6]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 1-level LoDTensor + Out.lod = [[0, 2, 3, 6]] + Out.data = [[a,b], [a,b] [c,d], [e, f], [e, f], [e, f]] + Out.dims = [6, 2] + +Case 4: + +Given 2-level a LoDTensor input(X) + X.lod = [[0, 2, 3], + [0, 1, 3, 4]] + X.data = [a, b, c, d] + X.dims = [4, 1] +and input(Y) + Y.lod = [[0, 2, 4], + [0, 3, 6, 6, 8]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 2-level LoDTensor + Out.lod = [[0, 2, 4], + [0, 3, 6, 6, 8]] + Out.data = [a, a, a, b, b, b, d, d] + Out.dims = [8, 1] + + +)DOC"); + } +}; + +class SequenceExpandOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X")); + 
PADDLE_ENFORCE(ctx->HasInput("Out")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, + sequence_expand_grad, ops::SequenceExpandOpGrad); +REGISTER_OP_CPU_KERNEL( + sequence_expand, + ops::SequenceExpandKernel); +REGISTER_OP_CPU_KERNEL( + sequence_expand_grad, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ac76d83da618680502d0add51ae68ac117ad2aa --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sequence_expand_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_expand, + ops::SequenceExpandKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_expand_grad, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8010627ff6f5acbf300b0f3f9281e60b4ebfa94e --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +template +class SequenceExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + const T* x_data = x->data(); + auto x_dims = x->dims(); + auto* y = context.Input("Y"); + PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); + PADDLE_ENFORCE_EQ(static_cast(x_dims[0]), + y->lod().back().size() - 1, + "The size of last lod level in Input(Y)" + "must be equal to dims[0] of Input(X)."); + out->set_lod(y->lod()); + auto* place = + context.template device_context().eigen_device(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); + + for (size_t i = 0; i < out_starts.size() - 1; i++) { + int scale = out_starts[i + 1] - out_starts[i]; + Eigen::TensorMap< + Eigen::Tensor> + x_t(x_data, 1, element_len); + Eigen::TensorMap> + out_t(out_data, scale, element_len); + Eigen::array cast({{scale, 1}}); + out_t.device(*place) = x_t.broadcast(cast); + x_data += element_len; + out_data += element_len * scale; + } + } +}; + +/* + *Given Grad(Out) + * + * Grad(Out).lod = [[0, 2], + * [0, 3, 6]] + * Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] + * Then + * Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)] + * = [0.6, 1.5] + * Grad(X).lod = Input(X).lod + * + * */ +template +class SequenceExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* d_x = 
context.Output(framework::GradVarName("X")); + auto out_last_level = out->lod().back(); + d_x->set_lod(x->lod()); + const T* d_out_data = d_out->data(); + T* d_x_data = d_x->mutable_data(context.GetPlace()); + size_t element_len = d_out->numel() / d_out->dims()[0]; + for (size_t i = 0; i < out_last_level.size() - 1; ++i) { + size_t repeat = out_last_level[i + 1] - out_last_level[i]; + Eigen::TensorMap< + Eigen::Tensor> + d_out_t(d_out_data, static_cast(repeat), element_len); + Eigen::TensorMap> + d_x_t(d_x_data, static_cast(element_len)); + auto place = + context.template device_context().eigen_device(); + d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); + d_out_data += (repeat * element_len); + d_x_data += element_len; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cfb336b2e0b31ab20182a36d806506b6af4c139 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_pool_op.h" + +namespace paddle { +namespace operators { + +class SequencePoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePoolOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + if (ctx->Attrs().Get("pooltype") == "MAX") { + PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"), + "Output(MaxIndex) of SequencePoolOp should not be null."); + ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X")); + } + } +}; + +class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp"); + AddOutput("Out", + "(Tensor) The output of SequencePoolOp does not contain LoD " + "infomation."); + AddOutput("MaxIndex", + "(Tensor) This tensor is used for the sequence max-pooling " + "to record the max indexes.") + .AsIntermediate(); + AddAttr( + "pooltype", + "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") + .SetDefault("AVERAGE") + .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}); + AddComment(R"DOC( +Sequence Pool Operator. + +The SequencePoolOp pools features of all time-steps of each instance. +It supports six pooling types: +1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$ +2. SUM: $$Out[i] = \sum_jX_{ij}$$ +3. SQRT: $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ +4. LAST: Out[i] = last instance in i-th sequence X[i] +5. FIRST: Out[i] = first instance in i-th sequence X[i] +6. 
MAX: $$Out[i] = max(X_i)$$ + +The following example explains how this works: +For a mini-batch of 3 variable-length sentences, +containing 2, 3, and 2 time-steps: + +Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2. +Besides, for the sake of simplicity, we assume M=1 and N=1, +and the value of X = [[1, 3], [2, 4, 6], [5, 1]]. + +Thus, Out is a [3,1,1] Tensor without LoD infomation. +And for different pooltype, the value of Out is as follows: + +- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 +- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1 +- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), + 6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2) +- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1) +- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) +- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) + + )DOC"); + } +}; + +class SequencePoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null."); + auto og_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(), + "The rank of output grad must equal to Input(X)."); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); + } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class 
SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_pool_grad"); + op_desc_ptr->SetInput("X", Input("X")); + if (boost::get(GetAttr("pooltype")) == "MAX") { + op_desc_ptr->SetInput("MaxIndex", Output("MaxIndex")); + } + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, + ops::SequencePoolGradOpMaker); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CPU_KERNEL( + sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_pool_op.cu b/paddle/fluid/operators/sequence_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..364769c39bd1b94935630eb8c16c0e27787139e1 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/sequence_pool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7b67e6201ebb04b3fbda3520347c580fd9501098 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.h @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SequencePoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + std::string pooltype = context.Attr("pooltype"); + + auto dims = in->dims(); + auto lod = in->lod(); + int64_t w = in->numel() / dims[0]; + + // InferShape by lod + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_GE( + dims[0], + /*batch size = */ static_cast(lod[0].size() - 1), + "The first dimension of Input(X) must be large than batch size."); + dims[0] = lod[0].size() - 1; + out->Resize({dims}); + + auto lod_level_0 = lod[0]; + + out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (pooltype == "MAX") { + math::MaxSeqPoolFunctor max_pool; + auto* index = context.Output("MaxIndex"); + index->Resize({dims}); + index->mutable_data(context.GetPlace()); + max_pool(dev_ctx, *in, out, index); + return; + } + + auto& place = + *context.template device_context().eigen_device(); + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor in_t = in->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + Tensor out_t = out->Slice(i, i + 1); + int64_t h = static_cast(lod_level_0[i + 1] - lod_level_0[i]); + auto in_e = EigenMatrix::From(in_t, framework::make_ddim({h, w})); + auto out_e = EigenVector::Flatten(out_t); + + if (pooltype == "AVERAGE") { + out_e.device(place) = 
in_e.mean(Eigen::array({{0}})); + } else if (pooltype == "SUM") { + out_e.device(place) = in_e.sum(Eigen::array({{0}})); + } else if (pooltype == "SQRT") { + out_e.device(place) = in_e.sum(Eigen::array({{0}})) / + std::sqrt(static_cast(h)); + } else if (pooltype == "LAST") { + out_e.device(place) = in_e.chip(h - 1, 0); + } else if (pooltype == "FIRST") { + out_e.device(place) = in_e.chip(0, 0); + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } + } + } +}; + +template +class SequencePoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out_g = context.Input(framework::GradVarName("Out")); + auto* in_g = context.Output(framework::GradVarName("X")); + std::string pooltype = context.Attr("pooltype"); + + auto dims = in->dims(); + auto lod = in->lod()[0]; + int64_t w = in->numel() / dims[0]; + + in_g->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + if (pooltype == "MAX") { + math::MaxSeqPoolGradFunctor max_pool_grad; + auto* index = context.Input("MaxIndex"); + max_pool_grad(dev_ctx, *out_g, *index, in_g); + return; + } + + if (pooltype == "LAST" || pooltype == "FIRST") { + // set X@Grad be zero at first when pooltype is LAST/FIRST + math::SetConstant functor; + functor(dev_ctx, in_g, 0); + } + auto& place = + *context.template device_context().eigen_device(); + + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + auto in_g_t = + in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); + auto out_g_t = out_g->Slice(i, i + 1); + int64_t h = static_cast(lod[i + 1] - lod[i]); + auto in_g_e = EigenMatrix::From(in_g_t, {h, w}); + auto out_g_e = EigenMatrix::From(out_g_t, {1, w}); + auto out_g_e_v = EigenVector::Flatten(out_g_t); + Eigen::DSizes bcast(h, 1); + + if (pooltype == "AVERAGE") { + in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); + } else if (pooltype == "SUM") { + 
in_g_e.device(place) = (out_g_e).broadcast(bcast); + } else if (pooltype == "SQRT") { + in_g_e.device(place) = + (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); + } else if (pooltype == "LAST") { + in_g_e.chip(h - 1, 0).device(place) = out_g_e_v; + } else if (pooltype == "FIRST") { + in_g_e.chip(0, 0).device(place) = out_g_e_v; + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4e42d3eeb5555be693946ccde30ef87f88d0f32 --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/sequence_reshape_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class SequenceReshapeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceReshapeOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_numel = product(x_dims); + PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2."); + int new_dim = ctx->Attrs().Get("new_dim"); + if (ctx->IsRuntime()) { + ctx->SetOutputDim("Out", + {x_numel / new_dim, static_cast(new_dim)}); + } else { + // when compiling, the batch size is undetermined, just set to -1 + ctx->SetOutputDim("Out", {-1, static_cast(new_dim)}); + } + } +}; + +class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceReshapeOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, default LoDTensor) A 2-D LoDTensor with shape " + "being [N, M]."); + AddOutput("Out", + "(LoDTensor, default LoDTensor) A 2-D LoDTensor with " + "shape [T, new_dim] where T is calculated based on X.lod, M and " + "new_dim."); + AddAttr("new_dim", "Sequence dimension of the output LoDTensor."); + AddComment(R"DOC( +Sequence Reshape Operator. + +This operator will rearrange the input sequences. The new dimension is set by +attribute and length of each sequence may change longer or shorter which is +decided by original length, original dimension and new dimension. 
The following +example will help to illustrate the function of this operator: + +x is a LoDTensor: + x.lod = [[0, 2, 6]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], [9, 10], [11, 12]] + x.dims = [6, 2] + +set new_dim = 4 + +then out is a LoDTensor: + out.lod = [[0, 1, 3]] + out.data = [[1, 2, 3, 4], + [5, 6, 7, 8], [9, 10, 11, 12]] + out.dims = [3, 4] + +Currently, only 1-level LoDTensor is supported and please make sure (original +length * original dimension) can be divided by new_dim with no remainder for +each sequence. + +)DOC"); + } +}; + +class SequenceReshapeGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceReshapeGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceReshapeGradOp should not be null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } +}; + +class SequenceReshapeGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_reshape_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeOp, + ops::SequenceReshapeOpMaker, ops::SequenceReshapeGradOpMaker); +REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp); 
+REGISTER_OP_CPU_KERNEL( + sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); +REGISTER_OP_CPU_KERNEL( + sequence_reshape_grad, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_reshape_op.cu b/paddle/fluid/operators/sequence_reshape_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ca3497396eaa6c811c69af4acf4fa3092cff42a --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_reshape_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_reshape_grad, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_reshape_op.h b/paddle/fluid/operators/sequence_reshape_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7a5d1261da917c6e596ff7b85afbfd95ff90f12a --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.h @@ -0,0 +1,86 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +template +class SequenceReshapeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int out_width = context.Attr("new_dim"); + + auto in_dims = in->dims(); + int64_t in_width = in_dims[1]; + auto& in_lod = in->lod(); + + PADDLE_ENFORCE_EQ(in_lod.size(), 1UL, + "Only support one level sequence now."); + PADDLE_ENFORCE_EQ( + (uint64_t)in_dims[0], in_lod[0].back(), + "Inconsistent size between X.shape[0] and X.lod()[0].back()."); + + auto in_lod_l0 = in_lod[0]; + int seq_num = in_lod_l0.size() - 1; + + if (in_width == out_width) { + out->set_lod(in->lod()); + } else { + auto& out_lod = *out->mutable_lod(); + out_lod.resize(1); + out_lod[0].resize(seq_num + 1); + out_lod[0][0] = 0; + for (int i = 0; i < seq_num; ++i) { + size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i]; + size_t offset = 0; + offset = (seq_len * in_width) / out_width; + PADDLE_ENFORCE_EQ(offset * out_width, seq_len * in_width, + "Please make sure (sequence_length * dimension) can " + "be divided by new_dim with no remainder for each " + "sequence. 
The %dth sequence is invalid.", + i + 1); + out_lod[0][i + 1] = out_lod[0][i] + offset; + } + } + + framework::Copy(*in, context.GetPlace(), out); + out->Resize({static_cast(out->lod()[0].back()), out_width}); + } +}; + +template +class SequenceReshapeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x_tensor_ptr = context.Input("X"); + auto* outg_tensor_ptr = + context.Input(framework::GradVarName("Out")); + auto* xg_tensor_ptr = + context.Output(framework::GradVarName("X")); + + xg_tensor_ptr->mutable_data(context.GetPlace()); + framework::Copy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr); + xg_tensor_ptr->Resize(x_tensor_ptr->dims()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..87b8eff64621290ebd75d2cb76d7c684655b884f --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_slice_op.h" + +namespace paddle { +namespace operators { + +class SequenceSliceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Offset"), + "Input(Offset) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Length"), + "Input(Length) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceSliceOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + + auto offset_dim = ctx->GetInputDim("Offset"); + auto length_dim = ctx->GetInputDim("Length"); + + PADDLE_ENFORCE_EQ( + offset_dim.size(), 2UL, + "Only support one level sequence now, The rank of offset must be 2."); + PADDLE_ENFORCE_EQ( + length_dim.size(), 2UL, + "Only support one level sequence now, The rank of Length must be 2."); + + // Initialize the output's dims to maximum, + // and re-set to real dims by the value of Offset and Length at kernel + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class SequenceSliceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + 
protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), " + "the input of SequenceSliceOp."); + AddInput("Offset", + "(Tensor), " + "a vector to describe the offset of every input sequence for " + "sub sequence item."); + AddInput("Length", + "(Tensor), " + "a vector to describe the length of every input sequence for " + "sub sequence item."); + AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp."); + AddComment(R"DOC( +Sequence slice operator + +The operator crops a subsequence from given sequence with given start offset and subsequence length. +It only supports sequence (LoD Tensor with level number is 1). +- Case: + X = [[a1, a2; + b1, b2; + c1, c2] + [d1, d2; + e1, e2]] + LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2) + Offset = [[0], [1]]; Length = [[2], [1]] + + Out = [[a1, a2; + b1, b2] + [e1, e2]] + LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2) +NOTE: The first dimension size of input, the size of offset and Length, should be equal. The offset start from 0. 
+ )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, + sequence_slice_grad, ops::SequenceSliceGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_slice, + ops::SequenceSliceOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_slice_grad, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_slice_op.cu b/paddle/fluid/operators/sequence_slice_op.cu new file mode 100755 index 0000000000000000000000000000000000000000..041fabdf9a2dc73540ab45e5e86aa1ef71bed4dc --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_slice_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_slice, + ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_slice_grad, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h new file mode 100644 index 0000000000000000000000000000000000000000..65c36a32aa12c628db5c4f0c104d3977e625ad97 --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.h @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, + const int64_t* length_data) { + auto out_lod = in.lod(); + size_t lod_offset = 0; + + auto n = in.lod()[0].size() - 1; + out_lod[0][0] = 0; + for (size_t i = 0; i < n; ++i) { + lod_offset += length_data[i]; + out_lod[0][i + 1] = lod_offset; + } + return out_lod; +} + +template +class SequenceSliceOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* offset = ctx.Input("Offset"); + auto* length = ctx.Input("Length"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + auto n = lod[0].size() - 1; + + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ( + n, static_cast(length->dims()[0]), + "The size of input-sequence and length-array should be the same"); + PADDLE_ENFORCE_EQ( + n, static_cast(offset->dims()[0]), + "The size of input-sequence and offset-array should be the same"); + + const int64_t* offset_data = offset->data(); + const int64_t* length_data = length->data(); + 
framework::Tensor offset_cpu; + framework::Tensor length_cpu; + + if (platform::is_gpu_place(ctx.GetPlace())) { + offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); + offset_data = offset_cpu.data(); + + length_cpu.mutable_data(length->dims(), platform::CPUPlace()); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); + length_data = length_cpu.data(); + } + + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_LT(0, offset_data[i], + "The offset[%d] must greater than zero.", i); + PADDLE_ENFORCE_LT(0, length_data[i], + "The length[%d] must greater than zero.", i); + PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + lod[0][i + 1], "The target tensor's length overflow."); + } + + out->mutable_data(ctx.GetPlace()); + auto out_lod = SequenceSliceLoD(*in, offset_data, length_data); + auto out_dims = in->dims(); + out_dims[0] = out_lod[0][out_lod[0].size() - 1]; + out->Resize(out_dims); + out->set_lod(out_lod); + + auto in_stride = framework::stride(in->dims()); + auto out_stride = framework::stride(out->dims()); + + size_t out_offset = 0; + for (size_t i = 0; i < n; ++i) { + Tensor in_t = in->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out->data() + out_offset); + out_offset += length_data[i] * in_stride[0]; + } + } +}; + +template +class SequenceSliceGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* offset = ctx.Input("Offset"); + auto* length = ctx.Input("Length"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + + const int64_t* offset_data = offset->data(); + const int64_t* 
length_data = length->data(); + framework::Tensor offset_cpu; + framework::Tensor length_cpu; + + if (platform::is_gpu_place(ctx.GetPlace())) { + offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); + offset_data = offset_cpu.data(); + + length_cpu.mutable_data(length->dims(), platform::CPUPlace()); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); + length_data = length_cpu.data(); + } + + auto lod = in->lod(); + auto out_lod = out_grad->lod(); + + if (x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + x_grad->set_lod(in->lod()); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), x_grad, + static_cast(0)); + + auto out_grad_stride = framework::stride(out_grad->dims()); + + for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { + Tensor out_grad_t = + out_grad->Slice(static_cast(out_lod[0][i]), + static_cast(out_lod[0][i + 1])); + auto out_grad_stride = framework::stride(out_grad_t.dims()); + + auto x_grad_stride = framework::stride(x_grad->dims()); + + Tensor x_grad_t = x_grad->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), out_grad_t.data(), + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f966b7162077943dd78d601743b3a3e2e103444b --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_softmax_op.h" + +namespace paddle { +namespace operators { + +class SequenceSoftmaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceSoftmaxOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension " + "of length 1."); + AddOutput("Out", + "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension " + "of length 1."); + AddComment(R"DOC( +Sequence Softmax Operator. + +SequenceSoftmaxOp computes the softmax activation among all time-steps for each +sequence. The dimension of each time-step should be 1. Thus, the shape of +input Tensor can be either [N, 1] or [N], where N is the sum of the length +of all sequences. 
+ +The algorithm works as follows: + + for i-th sequence in a mini-batch: + +$$ +Out(X[lod[i]:lod[i+1]], :) = \ +\frac{\exp(X[lod[i]:lod[i+1], :])} \ +{\sum(\exp(X[lod[i]:lod[i+1], :]))} +$$ + +For example, for a mini-batch of 3 sequences with variable-length, +each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7], +then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :] +and N turns out to be 7. + +)DOC"); + } +}; + +class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Input(Out) of SequenceSoftmaxGradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) of SequenceSoftmaxOp should not be null."); + + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("Out"), + ctx->GetInputDim(framework::GradVarName("Out")), + "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of " + "the same shape."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, + ops::SequenceSoftmaxOpMaker, sequence_softmax_grad, + ops::SequenceSoftmaxGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_softmax, + ops::SequenceSoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc new file mode 100644 index 
0000000000000000000000000000000000000000..c42dfd7540954616eb7bf012160a98211c3caf1b --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_softmax_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_softmax, + ops::SequenceSoftmaxKernel) +REGISTER_OP_CUDA_KERNEL( + sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.h b/paddle/fluid/operators/sequence_softmax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e6c21c67b3362835b2ff87045a213b2636556346 --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceSoftmaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = x->lod(); + auto dims = x->dims(); + + const size_t level = lod.size() - 1; + PADDLE_ENFORCE_EQ(dims[0], static_cast(lod[level].back()), + "The first dimension of Input(X) should be equal to the " + "sum of all sequences' lengths."); + PADDLE_ENFORCE_EQ(dims[0], x->numel(), + "The width of each timestep in Input(X) of " + "SequenceSoftmaxOp should be 1."); + + out->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor x_i = x->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); + x_i.Resize(dims_i); + out_i.Resize(dims_i); + math::SoftmaxFunctor()( + ctx.template device_context(), &x_i, &out_i); + } + } +}; + +template +class SequenceSoftmaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + + auto lod = x->lod(); + const size_t level = lod.size() - 1; + + x_grad->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = 
static_cast(lod[level][i + 1]); + + Tensor out_i = out->Slice(start_pos, end_pos); + Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); + Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); + out_i.Resize(dims_i); + out_grad_i.Resize(dims_i); + x_grad_i.Resize(dims_i); + math::SoftmaxGradFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, + &x_grad_i); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1e23a62f4ec52b40cfa1febc98fbfb045f45efd --- /dev/null +++ b/paddle/fluid/operators/sgd_op.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sgd_op.h" + +namespace paddle { +namespace operators { + +class SGDOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of SGDOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 element"); + auto param_dim = ctx->GetInputDim("Param"); + // TODO(qijun): check dimensions of Param and Grad at complie + // and run time. + ctx->SetOutputDim("ParamOut", param_dim); + } +}; + +class SGDOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("LearningRate", "(Tensor) Learning rate of SGD"); + AddInput("Grad", "(Tensor) Input gradient"); + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddComment(R"DOC( + +SGD operator + +This operator implements one step of the stochastic gradient descent algorithm. 
+ +$$param\_out = param - learning\_rate * grad$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, ops::SGDOpKernel); diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..09374e20494be2eebba913bd90a7c32e1aa0015b --- /dev/null +++ b/paddle/fluid/operators/sgd_op.cu @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sgd_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, + const int num, T* p_out) { + T lr = learning_rate[0]; + int grid_size = blockDim.x * gridDim.x; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) { + T g_data = g[i]; + T p_data = p[i]; + p_out[i] = p_data - lr * g_data; + } +} + +template +__global__ void SparseSGDFunctorKernel(const T* selected_rows, + const int64_t* rows, + const T* learning_rate, T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. + paddle::platform::CudaAtomicAdd( + tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]); + } +} +} // namespace + +template +class SGDOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param = ctx.Input("Param"); + auto* param_out = ctx.Output("ParamOut"); + auto* learning_rate = ctx.Input("LearningRate"); + + auto* grad_var = ctx.InputVar("Grad"); + // Actually, all tensors are LoDTensor except SelectedRows. 
+ if (grad_var->IsType()) { + param_out->mutable_data(ctx.GetPlace()); + auto* grad = ctx.Input("Grad"); + auto* grad_data = grad->data(); + auto* param_data = param->data(); + auto* param_out_data = param_out->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + + SGDKernel<<>>( + grad_data, param_data, learning_rate->data(), param->numel(), + param_out_data); + + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. + PADDLE_ENFORCE_EQ(param, param_out); + auto* grad = ctx.Input("Grad"); + + auto in_height = grad->height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + + auto& in_value = grad->value(); + framework::Vector in_rows(grad->rows()); + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in_rows.size()); + SparseSGDFunctorKernel< + T, 256><<>>( + in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), + out_data, in_row_numel); + + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel, + ops::SGDOpCUDAKernel); diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f1eaaecdb1eef1b42ea5d3b7315133c665b50df6 --- /dev/null +++ b/paddle/fluid/operators/sgd_op.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace operators { + +template +class SGDOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param = ctx.Input("Param"); + auto* param_out = ctx.Output("ParamOut"); + auto* learning_rate = ctx.Input("LearningRate"); + + auto* grad_var = ctx.InputVar("Grad"); + // Actually, all tensors are LoDTensor except SelectedRows. + if (grad_var->IsType()) { + param_out->mutable_data(ctx.GetPlace()); + auto* grad = ctx.Input("Grad"); + + auto p = framework::EigenVector::Flatten(*param); + auto g = framework::EigenVector::Flatten(*grad); + auto o = framework::EigenVector::Flatten(*param_out); + auto* lr = learning_rate->data(); + + o = p - lr[0] * g; + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. 
+ PADDLE_ENFORCE_EQ(param, param_out); + auto* grad = ctx.Input("Grad"); + + auto in_height = grad->height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + + auto& in_value = grad->value(); + auto& in_rows = grad->rows(); + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + auto* lr = learning_rate->data(); + + for (size_t i = 0; i < in_rows.size(); i++) { + for (int64_t j = 0; j < in_row_numel; j++) { + out_data[in_rows[i] * in_row_numel + j] -= + lr[0] * in_data[i * in_row_numel + j]; + } + } + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..df50a324fde1637f1f9f64a0b0d4eff8ba3f26d2 --- /dev/null +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/array_operator.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +class ShrinkRNNMemoryOp : public ArrayOp { + public: + ShrinkRNNMemoryOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x_var = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x_var != nullptr, "Input X must be set"); + auto &x_tensor = x_var->Get(); + size_t offset = this->GetOffset(scope, place); + auto *rank_table_var = scope.FindVar(Input("RankTable")); + PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set"); + auto &rank_table = rank_table_var->Get(); + + auto &rank_items = rank_table.items(); + int dst_num_rows = + std::lower_bound(rank_items.begin(), rank_items.end(), offset, + [](const framework::LoDRankTable::TableItem &a, + size_t b) { return a.length > b; }) - + rank_items.begin(); + + auto *out_var = scope.FindVar(Output("Out")); + PADDLE_ENFORCE(out_var != nullptr, "Output(Out) must be set."); + auto &out_tensor = *out_var->GetMutable(); + + size_t height = dst_num_rows; + + // do shrink for the top level LoD + if (x_tensor.lod().size() > 0 && + x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { + auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, + dst_num_rows, 0); + height = lod_offset.second.second; + auto out_lod = out_tensor.mutable_lod(); + framework::AppendLoD(out_lod, lod_offset.first); + } + + if (dst_num_rows != 0) { + out_tensor.ShareDataWith(x_tensor.Slice(0, height)); + } + } +}; + +class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + 
ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The RNN step memory to be shrinked."); + AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); + AddInput("I", + "(LoDTensor) The step index. The RNN step memory 'X' will be " + "shrinked to match the size of the input of the index'th step."); + AddOutput("Out", "(LoDTensor) The shrinked RNN step memory."); + AddComment(R"DOC( +This operator is used to shrink output batch of memory defined in dynamic RNN. + +Dynamic RNN is able to handle variable-length sequences, in which, sequences in +a mini-batch are sorted by their lengths first. After that, the longest sequence +becomes the first one in the sorted batch, followed by the second longest, the +third longest, and so on. Dynamic RNN then slices a batch input timestep by +timestep from the sorted input. Once any sequence in the input batch reaches its +end, memory defined in dynamicRNN has to shrink its outputs to adapt to the input +batch size for the next time step. 
+)DOC"); + } +}; + +class ShrinkRNNMemoryInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasInput("I")); + PADDLE_ENFORCE(context->HasInput("RankTable")); + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ShrinkRNNMemoryGradOp : public ArrayOp { + public: + ShrinkRNNMemoryGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out"))); + auto *dx_var = scope.FindVar(Output(framework::GradVarName("X"))); + PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr"); + auto *x_var = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x_var != nullptr); + + auto &x_tensor = x_var->Get(); + auto &dx_tensor = *dx_var->GetMutable(); + dx_tensor.Resize(x_tensor.dims()); + dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + if (dout_var == nullptr) { // dx_tensor fill zero + math::set_constant(dev_ctx, &dx_tensor, 0.0f); + } else { + auto &dout_tensor = dout_var->Get(); + auto height = dout_tensor.dims()[0]; + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); + if (dx_tensor.dims()[0] > height) { + auto rest_tensor = dx_tensor.Slice( + static_cast(height), static_cast(dx_tensor.dims()[0])); + math::set_constant(dev_ctx, &rest_tensor, 0.0f); + } + } + dx_tensor.set_lod(x_tensor.lod()); + } +}; + +class ShrinkRNNMemoryGradInferShape : public 
framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X"))); + context->SetOutputDim(framework::GradVarName("X"), + context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("shrink_rnn_memory_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(shrink_rnn_memory, ops::ShrinkRNNMemoryOp, + ops::ShrinkRNNMemoryInferShape, + ops::ShrinkRNNMemoryOpProtoMaker, ops::ShrinkRNNGradOpMaker); +REGISTER_OPERATOR(shrink_rnn_memory_grad, ops::ShrinkRNNMemoryGradOp, + ops::ShrinkRNNMemoryGradInferShape); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3188415a2bd4434704ca95b92427094023527019 --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], + "The 2nd dimension of Input(X) and Input(Label) should " + "be equal."); + + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SigmoidCrossEntropyWithLogitsGradOp + : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + 
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dout_dims.size(), 2, + "Input(Out@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], + "The 2nd dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0], + "The 1st dimension of Input(X) and Input(Out@Grad) " + "should be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1], + "The 2nd dimension of Input(X) and Input(Out@Grad) " + "should be equal."); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +class SigmoidCrossEntropyWithLogitsOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape N x D, " + "where N is the batch size and D is the number of classes. " + "This input is a tensor of logits computed by the previous " + " operator. Logits are unscaled log probabilities given as " + "log(p/(1-p))."); + AddInput("Label", + "(Tensor, default Tensor), a 2-D tensor of the same type " + "and shape as X. 
This input is a tensor of probabalistic labels " + "for each logit"); + AddOutput("Out", + "(Tensor, default Tensor), a 2-D tensor with shape N x D " + " of elementwise logistic losses."); + AddComment(R"DOC( +SigmoidCrossEntropyWithLogits Operator. + +This measures the element-wise probability error in classification tasks +in which each class is independent. This can be thought of as predicting labels +for a data-point, where labels are not mutually exclusive. +For example, a news article can be about politics, technology or sports +at the same time or none of these. + +The logistic loss is given as follows: + + $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$ + +We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get: + + $$loss = X - X * Labels + \log(1 + \exp(-X))$$ + +For stability and to prevent overflow of $$\exp(-X)$$ when X < 0, +we reformulate the loss as follows: + + $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$ + +Both the input `X` and `Labels` can carry the LoD (Level of Details) information. +However the output only shares the LoD with input `X`. 
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsOp, + ops::SigmoidCrossEntropyWithLogitsOpMaker, + sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradOp); +REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..daa9d3e4fa5aeba77f770f69d6057ced98741eaa --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h new file mode 100644 index 0000000000000000000000000000000000000000..977849f7627bc3f5b08a5f34bd300ab1442c6276 --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) +template +class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *Labels = context.Input("Label"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto labels = framework::EigenVector::Flatten(*Labels); + auto out = framework::EigenVector::Flatten(*Out); + auto &place = *context.device_context().eigen_device(); + + // term1 = max(x, 0) + auto term1 = x.cwiseMax(static_cast(0)); + // term2 = x * labels + auto term2 = x * labels; + // term3 = log(1 + exp(-abs(x))) + auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); + + out.device(place) = term1 - term2 + term3; + } +}; + +// dX = sigmoid(X) - labels +template +class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *Labels = context.Input("Label"); + const framework::Tensor *dOut = + context.Input(framework::GradVarName("Out")); + framework::Tensor *dX = + context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto labels = framework::EigenVector::Flatten(*Labels); + auto dout = framework::EigenVector::Flatten(*dOut); + auto dx = framework::EigenVector::Flatten(*dX); + auto &place = + *context.template device_context().eigen_device(); + + auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); + dx.device(place) = dout * (sigmoid_x - labels); + } +}; + +} 
// namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..54b962538b8426141c9ab1b9269c0ed8bd5a8496 --- /dev/null +++ b/paddle/fluid/operators/sign_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sign_op.h" + +namespace paddle { +namespace operators { + +class SignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SignOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SignOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class SignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of sign operator."); + AddOutput("Out", "(Tensor) Output tensor of sign operator."); + AddComment(R"DOC( +Sign operator + +$$Out = X.sign()$$ +)DOC"); + } +}; + +class SignGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + 
+ std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", 0.0f); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, + ops::SignGradMaker); +REGISTER_OP_CPU_KERNEL( + sign, ops::SignKernel); diff --git a/paddle/fluid/operators/sign_op.cu b/paddle/fluid/operators/sign_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..93cdb311eb4961a7754f9adfe14a15f3b2d0ca58 --- /dev/null +++ b/paddle/fluid/operators/sign_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sign_op.h" + +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel); diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c2ebebee40d9b64dd8b658b904e631ba294e41e --- /dev/null +++ b/paddle/fluid/operators/sign_op.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class SignKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* out = context.Output("Out"); + auto* in = context.Input("X"); + out->mutable_data(in->place()); + + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto& place = + *context.template device_context().eigen_device(); + eigen_out.device(place) = eigen_in.sign(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..be4c7a56a84e84c39a578b958fe7c9ad551f54f6 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/smooth_l1_loss_op.h" + +namespace paddle { +namespace operators { + +class SmoothL1LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(x_dims, y_dims); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The tensor rank of Input(X) should not be less than 2."); + if (ctx->HasInput("InsideWeight")) { + PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), + "If weights are provided, must specify both " + "inside and outside weights."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims); + } + + ctx->SetOutputDim("Diff", x_dims); + // loss is a two-rank tensor + ctx->SetOutputDim("Out", {x_dims[0], 1}); + } +}; + +template +class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The input value of smooth l1 loss op with shape " + "[batch_size, dim1, ..., dimN]."); + AddInput("Y", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The target value of smooth l1 loss op with same shape as X."); + AddInput("InsideWeight", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the result of (X - Y) will be multiplied " + "by this tensor element by element.") + .AsDispensable(); + AddInput("OutsideWeight", + "(Tensor, default Tensor) A tensor with rank at least 2. 
" + "This input is optional and should have same shape with X. " + "If provided, the out smooth l1 loss will be multiplied by this " + "tensor element by element.") + .AsDispensable(); + AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).") + .AsIntermediate(); + AddOutput("Out", + "(Tensor, default Tensor) A tensor with rank be 2. " + "The output smooth l1 loss with shape [batch_size, 1]."); + AddAttr("sigma", + "Hyper parameter of smooth l1 loss op." + "A float scalar with default value 3.0.") + .SetDefault(3.0); + AddComment(R"DOC( +Smooth L1 Loss Operator. + +This operator computes the smooth l1 loss for X and Y. +The operator takes the first dimension of X and Y as batch size. +For each instance, it computes the smooth l1 loss element by element first +and then sums all the losses. So the shape of Out is [batch_size, 1]. + +The equation is: +$$ +Out_{\sigma}(X, Y)_i = \begin{cases} +0.5 * (\sigma * (X_i - Y_i)) ^ 2 +\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\ +\frac{|X_i - Y_i| - 0.5}{{\sigma}^2}, +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. 
+ +)DOC"); + } +}; + +class SmoothL1LossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto in_dims = ctx->GetInputDim("X"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(out_dims.size(), 2, + "The tensor rank of Input(Out@Grad) should be 2."); + PADDLE_ENFORCE_EQ(out_dims[0], in_dims[0], + "The 1st dimension of Input(Out@Grad) must be " + "same as input."); + PADDLE_ENFORCE_EQ(out_dims[1], 1, + "The 2nd dimension of Input(Out@Grad) must be 1."); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, in_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, in_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, + ops::SmoothL1LossOpMaker, smooth_l1_loss_grad, + ops::SmoothL1LossGradOp); +REGISTER_OP_CPU_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CPU_KERNEL( + smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..94c0d6cd299075541f0ef66cbc0bd48a8f4d51b3 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/smooth_l1_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..325ad824e1874281873b5e41ab62db0fa43040d0 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SmoothL1LossForward { + HOSTDEVICE SmoothL1LossForward(const T& sigma2) : sigma2(sigma2) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val < 1.0 / sigma2) { + return 0.5 * val * val * sigma2; + } else { + return abs_val - 0.5 / sigma2; + } + } + + T sigma2; +}; + +template +class SmoothL1LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* in2 = context.Input("InsideWeight"); + auto* in3 = context.Input("OutsideWeight"); + auto* out0 = context.Output("Diff"); + auto* out1 = context.Output("Out"); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto* place = + context.template device_context().eigen_device(); + + auto sigma = static_cast(context.Attr("sigma")); + T sigma2 = sigma * sigma; + bool has_weight = (in2 != nullptr) && (in3 != nullptr); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + auto diff = EigenVector::Flatten(*out0); + + diff.device(*place) = x - y; + // multiply inside weight + if (has_weight) { + auto inside_weight = EigenVector::Flatten(*in2); + // cache diff, reused in bp + diff.device(*place) = diff * inside_weight; + } + + auto in_counts = in0->numel(); + Tensor ptensor_errors; + ptensor_errors.mutable_data({static_cast(in_counts)}, + context.GetPlace()); + auto errors = EigenVector::Flatten(ptensor_errors); + // apply smooth l1 forward + errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); + + // 
multiply outside weight + if (has_weight) { + auto outside_weight = EigenVector::Flatten(*in3); + errors.device(*place) = errors * outside_weight; + } + auto loss = EigenVector::Flatten(*out1); + // first dimension of 'X' is the number of samples + auto mat_dims = + framework::make_ddim({static_cast(in0->dims()[0]), + static_cast(in_counts / in0->dims()[0])}); + auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); + loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); + } +}; + +template +struct SmoothL1LossBackward { + HOSTDEVICE SmoothL1LossBackward(const T& sigma2) : sigma2(sigma2) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val < 1.0 / sigma2) { + return sigma2 * val; + } else { + return (0 < val) - (val < 0); + } + } + + T sigma2; +}; + +template +class SmoothL1LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("InsideWeight"); + auto* in1 = context.Input("OutsideWeight"); + auto* in2 = context.Input("Diff"); + auto* og = context.Input(framework::GradVarName("Out")); + auto sigma = static_cast(context.Attr("sigma")); + T sigma2 = sigma * sigma; + bool has_weight = (in0 != nullptr) && (in1 != nullptr); + + auto* place = + context.template device_context().eigen_device(); + + auto in_dims = in2->dims(); + auto counts = in2->numel(); + auto cols = counts / in_dims[0]; + auto mat_dims = framework::make_ddim( + {static_cast(in_dims[0]), static_cast(cols)}); + + Tensor ptensor_diff; + ptensor_diff.mutable_data({static_cast(counts)}, + context.GetPlace()); + auto diff = EigenVector::Flatten(ptensor_diff); + // apply smooth l1 backwoard + diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( + SmoothL1LossBackward(sigma2)); + + // compute weights + Tensor ptensor_weights; + ptensor_weights.mutable_data(mat_dims, context.GetPlace()); + auto weights = 
EigenMatrix::From(ptensor_weights); + // initialize to 1.0 + weights.device(*place) = weights.constant(static_cast(1.0)); + if (has_weight) { + auto inside_weight = EigenMatrix::From(*in0, mat_dims); + auto outside_weight = EigenMatrix::From(*in1, mat_dims); + weights.device(*place) = inside_weight * outside_weight; + } + + // compute gradients + auto out_grad = EigenMatrix::From(*og); + auto diff_mat_view = EigenMatrix::From(ptensor_diff, mat_dims); + auto gradients = out_grad.broadcast( + Eigen::array({{1, static_cast(cols)}})) * + weights * diff_mat_view; + + auto* out0 = context.Output(framework::GradVarName("X")); + auto* out1 = context.Output(framework::GradVarName("Y")); + + if (out0) { + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenMatrix::From(*out0, mat_dims); + x_grad.device(*place) = gradients; + } + + if (out1) { + out1->mutable_data(context.GetPlace()); + auto y_grad = EigenMatrix::From(*out1, mat_dims); + y_grad.device(*place) = -1 * gradients; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d9462d08b9cc06df2d0dca568dbbe1c50dc948f --- /dev/null +++ b/paddle/fluid/operators/softmax_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +class SoftmaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SoftmaxOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(x_dims.size() == 2UL, + "The input of softmax op must be a matrix."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input tensor of softmax. " + "2-D with shape [batch_size, input_feature_dimensions]."); + AddOutput("Out", "The normalized values with the same shape as X."); + AddComment(R"DOC( +Softmax Operator. + +The input of the softmax operator is a 2-D tensor with shape N x K (N is the +batch_size, K is the dimension of input feature). The output tensor has the +same shape as the input tensor. + +For each row of the input tensor, the softmax operator squashes the +K-dimensional vector of arbitrary real values to a K-dimensional vector of real +values in the range [0, 1] that add up to 1. +It computes the exponential of the given dimension and the sum of exponential +values of all the other dimensions in the K-dimensional vector input. +Then the ratio of the exponential of the given dimension and the sum of +exponential values of all the other dimensions is the output of the softmax +operator. 
+ +For each row $i$ and each column $j$ in Input(X), we have: + $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_{k} \exp(X[i, k])}$$ + +)DOC"); + } +}; + +class SoftmaxOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Out"), + ctx->GetInputDim(framework::GradVarName("Out")), + "Input(Out) and its gradients should have a same shape."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, + ops::SoftmaxOpGrad); +REGISTER_OP_CPU_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c53d8a2bc82dcfafc178b67299769b2e06109eb3 --- /dev/null +++ b/paddle/fluid/operators/softmax_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/softmax_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9287f0231031675b09c941f19c1df1fefc993506 --- /dev/null +++ b/paddle/fluid/operators/softmax_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SoftmaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Output("Out"); + + // allocate memory on device. 
+ Out->mutable_data(context.GetPlace()); + + math::SoftmaxFunctor()( + context.template device_context(), X, Out); + } +}; + +template +class SoftmaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* Out = context.Input("Out"); + auto* dOut = context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + + // allocate memory on device. + dX->mutable_data(context.GetPlace()); + + math::SoftmaxGradFunctor()( + context.template device_context(), Out, dOut, dX); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..79d56cb97d38ebd725668442c29229ef22f5b05e --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -0,0 +1,204 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class SoftmaxWithCrossEntropyOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "(Tensor, default: Tensor), The unscaled log probabilities " + "which is a 2-D tensor with shape [N x K]. N is the batch_size, " + "and K is the class number."); + AddInput("Label", + "(Tensor) The ground truth which is a 2-D tensor. If soft_label " + "is set to false, Label is a Tensor with shape [N x 1]. If " + "soft_label is set to true, Label is a Tensor with " + "shape [N x K]."); + AddOutput( + "Softmax", + "(Tensor, default: Tensor), A 2-D tensor with shape [N x K]. " + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation.") + .AsIntermediate(); + AddOutput("Loss", + "(Tensor, default: Tensor), A 2-D tensor. The cross " + "entropy loss with shape [N x 1]."); + AddAttr( + "soft_label", + "(bool, default: false), A flag to indicate whether to interpretate " + "the given labels as soft labels.") + .SetDefault(false); + AddComment(R"DOC( +Softmax With Cross Entropy Operator. + +Cross entropy loss with softmax is used as the output layer extensively. This +operator computes the softmax normalized values for each row of the input +tensor, after which cross-entropy loss is computed. This provides a more +numerically stable gradient. + +Because this operator performs a softmax on logits internally, it expects +unscaled logits. This operator should not be used with the output of +softmax operator since that would produce incorrect results. + +When the attribute soft_label is set false, this operators expects mutually +exclusive hard labels, each sample in a batch is in exactly one class with a +probability of 1.0. 
Each sample in the batch will have a single label. + +The equation is as follows: + +1) Hard label (one-hot label, so every sample has exactly one class) + +$$Loss_j = -\text{Logit}_{Label_j} + +\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), +j = 1,..., K$$ + +2) Soft label (each sample can have a distribution over all classes) + +$$Loss_j = -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i - +\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), +j = 1,...,K$$ + +)DOC"); + } +}; + +class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Softmax"), + "Output(Softmax) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) should be not null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ( + logits_dims.size(), 2UL, + "The input of softmax_with_cross_entropy should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, + "The labels should be a 2-D tensor."); + + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(logits_dims[1], labels_dims[1], + "If Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(labels_dims[1], 1UL, + "If Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + + ctx->SetOutputDim("Softmax", logits_dims); + ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + + ctx->ShareLoD("Logits", /*->*/ "Softmax"); + ctx->ShareLoD("Logits", /*->*/ "Loss"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { 
+ return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Softmax"), + "Input(Softmax) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@Grad) should be not null."); + + auto softmax_dims = ctx->GetInputDim("Softmax"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, + "The labels should be a 2-D tensor."); + + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(softmax_dims[1], labels_dims[1], + "When Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(labels_dims[1], 1UL, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Softmax")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Loss"))->type()), + ctx.device_context()); + } +}; + +class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("softmax_with_cross_entropy_grad"); + grad_op->SetInput("Label", Input("Label")); + grad_op->SetInput("Softmax", 
Output("Softmax")); + grad_op->SetInput("Loss", Output("Loss")); + grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax")); + grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, + ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker); +REGISTER_OPERATOR(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyOpGrad); +REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyKernel, + ops::SoftmaxWithCrossEntropyKernel); +REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradKernel, + ops::SoftmaxWithCrossEntropyGradKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..410d9e8887c593249495e08424467b4be15c9bcb --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +namespace { +template +__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad, + const int64_t* labels, const int batch_size, + const int class_num) { + // One thread per logit element. Each thread both subtracts the one-hot + // label and applies the upstream loss gradient to its own element, so no + // ordering between threads (or blocks) is required. __syncthreads() only + // synchronizes a single block and could not order the two steps here. + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid < batch_size * class_num) { + int sample_idx = tid / class_num; + PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num); + if (tid % class_num == labels[sample_idx]) { + logit_grad[tid] -= T(1); + } + logit_grad[tid] *= loss_grad[sample_idx]; + } +} + +template +__global__ void SoftCrossEntropyGradientKernel(T* logit_grad, + const T* loss_grad, + const T* labels, + const int batch_size, + const int class_num) { + int ids = blockIdx.x * blockDim.x + threadIdx.x; + if (ids < batch_size * class_num) { + int row_ids = ids / class_num; + logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]); + } +} +} // namespace + +template +class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Label"); + Tensor* softmax = context.Output("Softmax"); + + Tensor* loss = context.Output("Loss"); + softmax->mutable_data(context.GetPlace()); + loss->mutable_data(context.GetPlace()); + + math::SoftmaxFunctor()( + context.cuda_device_context(), logits, softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, + context.Attr("soft_label")); + } +}; + +template +class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + const Tensor* labels = context.Input("Label"); + const T* loss_grad_data = + context.Input(framework::GradVarName("Loss"))->data(); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + logit_grad->ShareDataWith(*context.Input("Softmax")); + T* logit_grad_data = logit_grad->data(); + + const int batch_size = logit_grad->dims()[0]; + const int class_num = logit_grad->dims()[1]; + int block = 512; + int grid = (batch_size * class_num + block - 1) / block; + + if (context.Attr("soft_label")) { + const T* label_data = labels->data(); + SoftCrossEntropyGradientKernel< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); + } else { + const int64_t* label_data = labels->data(); + CrossEntropyGrad< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0927efd42ceb35cc4183f84d160c44f35f6cc3f5 --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), + "This kernel only runs on CPU."); + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Label"); + Tensor* softmax = context.Output("Softmax"); + Tensor* loss = context.Output("Loss"); + + softmax->mutable_data(context.GetPlace()); + loss->mutable_data(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + math::SoftmaxFunctor()(dev_ctx, logits, + softmax); + math::CrossEntropyFunctor()( + dev_ctx, loss, softmax, labels, context.Attr("soft_label")); + } +}; + +template +class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* out_grad = + context.Input(framework::GradVarName("Loss")); + const Tensor* labels = context.Input("Label"); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + logit_grad->ShareDataWith(*context.Input("Softmax")); + + const int class_num = logit_grad->dims()[1]; + auto 
out_grad_mat = EigenMatrix::From(*out_grad); + auto logit_grad_mat = EigenMatrix::From(*logit_grad); + auto& place = *context.template device_context() + .eigen_device(); + if (context.Attr("soft_label")) { + auto lbl_mat = EigenMatrix::From(*labels); + logit_grad_mat.device(place) = + out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * + (logit_grad_mat - lbl_mat); + } else { + logit_grad_mat.device(place) = + logit_grad_mat * + out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); + + const int batch_size = logit_grad->dims()[0]; + const int64_t* label_data = labels->data(); + T* logit_grad_data = logit_grad->data(); + const T* out_grad_data = out_grad->data(); + for (int i = 0; i < batch_size; ++i) { + logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f821dc54d7bbe697d3642e64dc1628ec7d966592 --- /dev/null +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct CopyRange { + size_t begin; + size_t end; +}; + +using LoD = framework::LoD; + +class SplitLoDTensorOp : public framework::OperatorBase { + public: + SplitLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &mask = scope.FindVar(Input("Mask"))->Get(); + auto *out_true = + scope.FindVar(Output("OutTrue"))->GetMutable(); + auto *out_false = + scope.FindVar(Output("OutFalse"))->GetMutable(); + auto level = static_cast(Attr("level")); + auto &x_lod = x.lod(); + auto &mask_dim = mask.dims(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + + std::unique_ptr cpu_mask{new framework::LoDTensor()}; + if (platform::is_cpu_place(mask.place())) { + cpu_mask->ShareDataWith(mask); + } else if (platform::is_gpu_place(mask.place())) { +#ifdef PADDLE_WITH_CUDA + framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); +#else + PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); +#endif + } + auto *mask_data = cpu_mask->data(); + + // Indexed by branch below (0 -> OutFalse, 1 -> OutTrue), so exactly two + // lists are needed regardless of the mask length. + std::vector> copy_ranges(2); + + // set out_true/out_false lod + for (size_t t = 0; t < 2; t++) { + LoD *lod = nullptr; + if (t == 0) { + lod = out_false->mutable_lod(); + } else { + lod = out_true->mutable_lod(); + } + lod->clear(); + for (size_t i = 0; i < static_cast(mask_dim[0]); i++) { + if (static_cast(mask_data[i]) == t) { + size_t start_idx = i; + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x_lod, start_idx, 
start_idx + 1, level); + + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); + } + } + } + + for (size_t t = 0; t < 2; ++t) { + framework::LoDTensor *out; + if (t == 0) { + out = out_false; + } else { + out = out_true; + } + auto &ranges = copy_ranges[t]; + size_t height = std::accumulate( + ranges.begin(), ranges.end(), 0UL, + [](size_t a, const CopyRange &b) { return a + b.end - b.begin; }); + auto x_dim = x.dims(); + x_dim[0] = static_cast(height); + out->Resize(x_dim); + out->mutable_data(x.place(), x.type()); + size_t offset = 0; + for (auto &each_range : ranges) { + size_t len = each_range.end - each_range.begin; + if (len == 0) { + continue; + } + // out[offset: offset+len] = x[each_range.begin: each_range.end] + auto slice = out->Slice(static_cast(offset), + static_cast(offset + len)); + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); + offset += len; + } + } + } +}; + +class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input LoDTensor"); + AddInput("Mask", "A bool column vector which mask the input"); + AddOutput("OutTrue", "True branch of input LoDTensor"); + AddOutput("OutFalse", "False branch of input LoDTensor"); + AddAttr("level", "(int) the specific lod level to split.") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment( + R"DOC( + Split a LoDTensor with a Mask at certain level. The input LoDTensor + has 3 sequence at certain lod level. The Mask is a bool column vector, + such as [0, 1, 0] at the same level. 
The first and third sequence will + be send to False Output LoDTensor; whereas the second sequence will + be send to True Output LoDTensor. Please refer to MergeLoDTensorOp.)DOC"); + } +}; + +class SplitLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "SplitLoDTensorOp must has input X."); + PADDLE_ENFORCE(context->HasInput("Mask"), + "SplitLoDTensorOp must has input Mask."); + PADDLE_ENFORCE(context->HasOutput("OutTrue"), + "SplitLoDTensorOp must has output OutTrue."); + PADDLE_ENFORCE(context->HasOutput("OutFalse"), + "SplitLoDTensorOp must has output OutFalse."); + + auto mask_dim = context->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ(mask_dim.size(), 2); + PADDLE_ENFORCE_EQ(mask_dim[1], 1); + + context->SetOutputDim("OutTrue", context->GetInputDim("X")); + context->SetOutputDim("OutFalse", context->GetInputDim("X")); + } +}; + +class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("merge_lod_tensor"); + grad_op->SetInput("InTrue", OutputGrad("OutTrue")); + grad_op->SetInput("InFalse", OutputGrad("OutFalse")); + grad_op->SetInput("Mask", Input("Mask")); + grad_op->SetInput("X", Input("X")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp, + ops::SplitLoDTensorOpProtoMaker, + ops::SplitLoDTensorInferShape, + ops::SplitLoDTensorArrayGradMaker); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..f8bc22fe1d3d24866b5bf2506ed0ff585d259cc2 --- /dev/null +++ b/paddle/fluid/operators/split_op.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +class SplitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SplitOp should not be null."); + PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, + "Outputs(Out) of SplitOp should not be empty."); + auto in_dims = ctx->GetInputDim("X"); + auto outs_names = ctx->Outputs("Out"); + size_t axis = static_cast(ctx->Attrs().Get("axis")); + size_t num = static_cast(ctx->Attrs().Get("num")); + std::vector sections = static_cast>( + ctx->Attrs().Get>("sections")); + const size_t outs_number = outs_names.size(); + std::vector outs_dims; + outs_dims.reserve(outs_number); + + if (num > 0) { + int64_t in_axis_dim = in_dims[axis]; + PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, + "tensor split does not result" + " in an equal division"); + size_t out_axis_dim = in_axis_dim / num; + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + 
outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + PADDLE_ENFORCE_EQ(sections.size(), outs_number, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + ctx->SetOutputsDim("Out", outs_dims); + if (axis != 0) { + // Only pass LoD when not spliting along the first dim. + for (size_t i = 0; i < outs_number; ++i) { + ctx->ShareLoD("X", "Out", 0, i); + } + } + } +}; + +class SplitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of the split operator."); + AddOutput("Out", "(Tensor) Output tensors of the split operator.") + .AsDuplicable(); + AddComment(R"DOC( +Split operator + +This operator splits the input tensor into multiple sub-tensors. + +Example: + Input = [[1,2], + [3,4], + [5,6]] + sections = [2,1] + axis = 0 + Output[0] = [[1,2], + [3,4]] + Output[1] = [[5,6]] + + )DOC"); + AddAttr>("sections", + "(vector) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector{}); + AddAttr("num", + "(int, default 0)" + "Number of sub-tensors. 
This must evenly divide " + "Input.dims()[axis]") + .SetDefault(0); + AddAttr("axis", + "(int, default 0) " + "The axis which the input will be splited on.") + .SetDefault(0); + } +}; + +class SplitGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto op = new framework::OpDesc(); + op->SetType("concat"); + op->SetInput("X", OutputGrad("Out")); + op->SetOutput("Out", InputGrad("X")); + op->SetAttrMap(Attrs()); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +USE_CPU_ONLY_OP(concat); + +REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker); +REGISTER_OP_CPU_KERNEL(split, + ops::SplitOpKernel); diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..279691c759e988ea26a62e1263198d2a9d878cf9 --- /dev/null +++ b/paddle/fluid/operators/split_op.cu.cc @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/split_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + split, ops::SplitOpKernel); diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e78218f2fb108dac5bce717e03ce0aba1ed88195 --- /dev/null +++ b/paddle/fluid/operators/split_op.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +template +class SplitOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + auto in_stride = framework::stride(in->dims()); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = outs.size(); + size_t input_offset = 0; + for (size_t i = 0; i < n; i++) { + auto& out = outs[i]; + out->mutable_data(ctx.GetPlace()); + size_t axis_dim = out->dims()[axis]; + auto out_stride = framework::stride(out->dims()); + StridedMemcpy(ctx.device_context(), in->data() + input_offset, + in_stride, out->dims(), out_stride, out->data()); + input_offset += axis_dim * in_stride[axis]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..113ce2ce109778a355130aaf686261c1f71c0980 --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/split_selected_rows_op.h" + +namespace paddle { +namespace operators { + +class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input SelectedRows."); + AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable(); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + + AddComment(R"DOC( +Split a SelectedRows with a specified rows section. +height_sections is only needed when need to split the dims of the original tensor. + +Example: + Input: + X.rows = {7, 5} + X.height = 12 + Attr: + height_sections = {4, 8} + Out: + out0.rows = {} + out0.height = 4 + + out1.rows = {5, 7} + out2.height = 8 + +)DOC"); + } +}; + +class SplitSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "SplitSelectedRowsOp must has input X."); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), + "SplitSelectedRowsOp must has output Out."); + + std::vector height_sections = + ctx->Attrs().Get>("height_sections"); + int64_t n = ctx->Outputs("Out").size(); + + std::vector outs_dims; + outs_dims.reserve(n); + + // make output dims + for (int64_t i = 0; i < n; ++i) { + auto dims = ctx->GetInputDim("X"); + if (height_sections.size()) { + PADDLE_ENFORCE_EQ( + height_sections.size(), static_cast(n), + "The size of height section should be the same with height" + " section size."); + dims[0] = height_sections[i]; + } + outs_dims.push_back(dims); + } + ctx->SetOutputsDim("Out", outs_dims); + } +}; + +class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + 
std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("sum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp, + ops::SplitSelectedRowsOpMaker, + ops::SplitSelectedRowsGradMaker); +REGISTER_OP_CPU_KERNEL( + split_selected_rows, + ops::SplitSelectedRowsOpKernel); diff --git a/paddle/fluid/operators/split_selected_rows_op.cu b/paddle/fluid/operators/split_selected_rows_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0bbf1ecfaefeddd426b1055d93ce39a138abec28 --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/split_selected_rows_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + split_selected_rows, + ops::SplitSelectedRowsOpKernel); diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h new file mode 100644 index 0000000000000000000000000000000000000000..527264bd675520a98b442380e2d1ec259964e92e --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +static int FindOutIdx(int row, const std::vector& height_sections) { + int offset = 0; + for (size_t i = 0; i < height_sections.size(); ++i) { + if (row >= offset && row < (offset + height_sections[i])) { + return i; + } + offset += height_sections[i]; + } + return -1; +} + +template +class SplitSelectedRowsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + auto height_sections = ctx.Attr>("height_sections"); + + auto x_rows = x->rows(); + std::vector> outs_rows_idx; + outs_rows_idx.resize(outs.size()); + + auto row_numel = x->value().numel() / x->value().dims()[0]; + auto src = x->value().data(); + + for (size_t i = 0; i < x_rows.size(); ++i) { + int out_idx = FindOutIdx(x_rows[i], height_sections); + outs_rows_idx[out_idx].push_back(i); + } + auto place = ctx.GetPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + if (rows_idx.size() > 0) { + auto dims = x->GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_value()->mutable_data(dims, x->place()); + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(x_rows[idx]); + } + auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy(platform::CPUPlace(), dst + j * row_numel, + platform::CPUPlace(), src + rows_idx[j] * row_numel, + sizeof(T) * row_numel); + } else { +#ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), src + rows_idx[j] * row_numel, + sizeof(T) * row_numel, stream); +#else + 
PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6755b12000463b111fe65dbdf2c140a060d968b --- /dev/null +++ b/paddle/fluid/operators/spp_op.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/spp_op.h" +namespace paddle { +namespace operators { + +class SppOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SppOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of spp operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of spp operator." + "N * M." + "M = C * H * W"); + AddAttr("pyramid_height", "(int), multi level pooling"); + AddAttr( + "pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddComment(R"DOC( + "With spatial pyramid pooling, the input image can + be of any sizes. This not only allows arbitrary aspect + ratios, but also allows arbitrary scales. 
We can resize + the input image to any scale (e.g., min(w, h)=180, 224, + ...) and apply the same deep network. When the + input image is at different scales, the network (with + the same filter sizes) will extract features at different + scales. The scales play important roles in traditional + methods. + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(H_{out}, W_{out})$ + Where + $$ + H_{out} = N \\ + W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in} + $$ + paper https://arxiv.org/pdf/1406.4729v4.pdf + )DOC"); + } +}; + +class SppOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SppOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SppOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + int pyramid_height = ctx->Attrs().Get("pyramid_height"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Spping intput must be of 4-dimensional."); + int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1]; + std::vector output_shape({in_x_dims[0], outlen}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class SppOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad); +REGISTER_OP_CPU_KERNEL( + spp, ops::SppKernel, + ops::SppKernel); 
+REGISTER_OP_CPU_KERNEL( + spp_grad, ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..cad2ca5ef8e16ee6ea6943f61da33367099e0937 --- /dev/null +++ b/paddle/fluid/operators/spp_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/spp_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + spp, ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CUDA_KERNEL( + spp_grad, ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1da1f805807fc648b3a54de91842f163b356435b --- /dev/null +++ b/paddle/fluid/operators/spp_op.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { +template +class SppKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + auto* out = context.Output("Out"); + int pyramid_height = context.template Attr("pyramid_height"); + std::string pooling_type = + context.template Attr("pooling_type"); + out->mutable_data(context.GetPlace()); + auto out_stride = framework::stride(out->dims()); + int input_h = in_x->dims()[2]; + int input_w = in_x->dims()[3]; + size_t output_offset = 0; + for (int p = 0; p < pyramid_height; ++p) { + int bins = std::pow(2, p); + int kernel_size_h = std::ceil(input_h / static_cast(bins)); + int kernel_size_w = std::ceil(input_w / static_cast(bins)); + int padding_h = (kernel_size_h * bins - input_h + 1) / 2; + int padding_w = (kernel_size_w * bins - input_w + 1) / 2; + std::vector kernel_size({kernel_size_h, kernel_size_w}); + std::vector strides({kernel_size_h, kernel_size_w}); + std::vector paddings({padding_h, padding_w}); + // pooling output shape + framework::Tensor out_level; + std::vector output_shape_vec( + {in_x->dims()[0], in_x->dims()[1], bins, bins}); + framework::DDim output_shape(framework::make_ddim(output_shape_vec)); + out_level.mutable_data(output_shape, context.GetPlace()); + // pooling + if (pooling_type == "max") { + math::Pool2dFunctor, T> pool_forward; + math::MaxPool max_process; + pool_forward(context.template device_context(), *in_x, + kernel_size, strides, paddings, max_process, &out_level); + } else if (pooling_type == "avg") { + math::Pool2dFunctor, T> pool_forward; + 
math::AvgPool avg_process; + pool_forward(context.template device_context(), *in_x, + kernel_size, strides, paddings, avg_process, &out_level); + } + // flatten pooling output shape + int output_flatten_w = in_x->dims()[1] * bins * bins; + std::vector output_flatten_shape_vec( + {in_x->dims()[0], output_flatten_w}); + framework::DDim output_flatten_shape( + framework::make_ddim(output_flatten_shape_vec)); + out_level.Resize(output_flatten_shape); + // concat + auto out_level_stride = framework::stride(out_level.dims()); + StridedMemcpy(context.template device_context(), + out_level.data(), out_level_stride, out_level.dims(), + out_stride, out->data() + output_offset); + output_offset += out_level.dims()[1] * out_level_stride[1]; + } + } +}; +template +class SppGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + int pyramid_height = context.template Attr("pyramid_height"); + std::string pooling_type = + context.template Attr("pooling_type"); + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + auto out_stride = framework::stride(out->dims()); + int input_h = in_x->dims()[2]; + int input_w = in_x->dims()[3]; + size_t out_offset = 0; + for (int p = 0; p < pyramid_height; ++p) { + int bins = std::pow(2, p); + int kernel_size_h = std::ceil(input_h / static_cast(bins)); + int kernel_size_w = std::ceil(input_w / static_cast(bins)); + int padding_h = (kernel_size_h * bins - input_h + 1) / 2; + int padding_w = (kernel_size_w * bins - input_w + 1) / 2; + std::vector kernel_size({kernel_size_h, 
kernel_size_w}); + std::vector strides({kernel_size_h, kernel_size_w}); + std::vector paddings({padding_h, padding_w}); + // split out and outgrad ... to flatten + framework::Tensor out_level; + framework::Tensor outgrad_level; + int out_flatten_w = in_x->dims()[1] * bins * bins; + std::vector out_flatten_shape_vec( + {in_x->dims()[0], out_flatten_w}); + framework::DDim out_flatten_shape( + framework::make_ddim(out_flatten_shape_vec)); + out_level.mutable_data(out_flatten_shape, context.GetPlace()); + outgrad_level.mutable_data(out_flatten_shape, context.GetPlace()); + auto flatten_stride = framework::stride(out_level.dims()); + // memcpy + StridedMemcpy(context.template device_context(), + out->data() + out_offset, out_stride, + out_level.dims(), flatten_stride, out_level.data()); + + StridedMemcpy(context.template device_context(), + out_grad->data() + out_offset, out_stride, + outgrad_level.dims(), flatten_stride, + outgrad_level.data()); + out_offset += out_level.dims()[1] * out_stride[1]; + // flatten backward to nchw + + std::vector out_shape_vec({in_x->dims()[0], in_x->dims()[1]}); + out_shape_vec.push_back( + (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1); + out_shape_vec.push_back( + (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out_level.ShareDataWith(out_level); + out_level.Resize(out_shape); + outgrad_level.ShareDataWith(outgrad_level); + outgrad_level.Resize(out_shape); + // pooling backward + if (pooling_type == "max") { + math::MaxPool2dGradFunctor pool2d_backward; + pool2d_backward(context.template device_context(), *in_x, + *&out_level, *&outgrad_level, kernel_size, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + math::Pool2dGradFunctor, T> + pool_backward; + math::AvgPoolGrad avg_process; + pool_backward(context.template device_context(), *in_x, + *&out_level, *&outgrad_level, kernel_size, strides, + paddings, avg_process, 
in_x_grad); + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c1d0c2c7f392cae3dc30611a0f077c1af7b68cbe --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/squared_l2_distance_op.h" + +namespace paddle { +namespace operators { + +class SquaredL2DistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("sub_result"), + "Output(sub_result) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SquaredL2DistanceOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(framework::arity(x_dims), framework::arity(y_dims), + "Tensor rank of both SquaredL2DistanceOp's " + "inputs must be same."); + + int rank = framework::arity(x_dims); + 
PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2."); + PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], product(y_dims) / y_dims[0], + "Product of dimensions expcet the first dimension of " + "input and target must be equal."); + PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0], + "First dimension of target must be equal to input " + "or to 1."); + + ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]}); + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input of SquaredL2DistanceOp."); + AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp."); + AddOutput("sub_result", + "(Tensor) Buffering subtraction result which " + "will be reused in backward.") + .AsIntermediate(); + AddOutput("Out", "(Tensor) Squared l2 distance between input and target."); + AddComment(R"DOC( +SquaredL2Distance operator + +This operator will cacluate the squared L2 distance for the input and +the target. Number of distance value will be equal to the first dimension +of input. First dimension of the target could be equal to the input or to 1. +If the first dimension of target is 1, the operator will broadcast target's +first dimension to input's first dimension. During backward propagation, +the user can decide whether to calculate the gradient of the input or +the target or both. + +Both the input X and Y can carry the LoD (Level of Details) information. +However, the output only shares the LoD information with input X. 
+ )DOC"); + } +}; + +class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], + "First dimension of output gradient and " + "input value must be equal."); + PADDLE_ENFORCE_EQ(out_dims[1], 1, + "Second dimension of output gradient " + "must be 1."); + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims); + if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, + ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad, + ops::SquaredL2DistanceGradOp); +REGISTER_OP_CPU_KERNEL( + squared_l2_distance, + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..959e7afac99bd2565890cb6a296bc908f250a16c --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/squared_l2_distance_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + squared_l2_distance, + ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aab241247e5e92f43a997f3d29c8e7d7d44d7711 --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SquaredL2DistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("sub_result"); + auto* out1 = context.Output("Out"); + + auto in0_dims = in0->dims(); + auto in1_dims = in1->dims(); + + int cols = in0->numel() / in0_dims[0]; + // reduce dimensions except the first + auto x = + EigenMatrix::From(*in0, framework::make_ddim({in0_dims[0], cols})); + auto y = + EigenMatrix::From(*in1, framework::make_ddim({in1_dims[0], cols})); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto sub_result = EigenMatrix::From(*out0); + auto z = EigenVector::Flatten(*out1); + + auto& place = + *context.template device_context().eigen_device(); + auto x_dims = x.dimensions(); + auto y_dims = y.dimensions(); + // buffer the substraction result + if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) { + sub_result.device(place) = + x - + y.broadcast(Eigen::array({{static_cast(x_dims[0]), 1}})); + } else { + sub_result.device(place) = x - y; + } + auto sub_res_pow2 = sub_result * sub_result; + z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); + } +}; + +template +class SquaredL2DistanceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("sub_result"); + auto* in1 = context.Input(framework::GradVarName("Out")); + auto* x_g = context.Output(framework::GradVarName("X")); + auto* y_g = context.Output(framework::GradVarName("Y")); + + auto sub_result = 
EigenMatrix::From(*in0); + auto out_grad = EigenMatrix::From(*in1); + + auto x_dims = x_g->dims(); + auto y_dims = y_g->dims(); + + int cols = x_g->numel() / x_dims[0]; + // calculate gradient + auto grad_mat = 2 * + (out_grad.broadcast(Eigen::array({{1, cols}}))) * + sub_result; + + // propagate back to input + auto& eigen_place = + *context.template device_context().eigen_device(); + if (x_g) { + x_g->mutable_data(context.GetPlace()); + // eigen matrix + auto x_grad = + EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); + // dimensions are same with subResult + x_grad.device(eigen_place) = grad_mat; + } + + if (y_g) { + y_g->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], + "First dimension of gradient must be greater or " + "equal than first dimension of target."); + + if (sub_result.dimensions()[0] == y_dims[0]) { + auto y_grad = + EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); + y_grad.device(eigen_place) = -1 * grad_mat; + } else { + auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); + auto y_grad = EigenVector::Flatten(*y_g); + y_grad.device(eigen_place) = col_sum_res; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a43cc22994b1b15a4acf0fd89b956bf05d3f35c8 --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/squared_l2_norm_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SquaredL2NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + ctx->SetOutputDim("Out", {1}); + } +}; + +class SquaredL2NormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of squared_l2_norm op."); + AddOutput("Out", "(Scalar) The output of squared_l2_norm op."); + AddComment(R"DOC( +SquaredL2Norm Operator. + +Computes the squared L2 norm of a tensor. 
+ +$$Out = \sum_{i} X_{i}^2$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, + squared_l2_norm_grad, ops::SquaredL2NormGradOp); +REGISTER_OP_CPU_KERNEL( + squared_l2_norm, + ops::SquaredL2NormKernel); +REGISTER_OP_CPU_KERNEL( + squared_l2_norm_grad, + ops::SquaredL2NormGradKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op.cu b/paddle/fluid/operators/squared_l2_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..52f4ab79b2189b269b7d0685dccedc52b627ad8d --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/squared_l2_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + squared_l2_norm, + ops::SquaredL2NormKernel); +REGISTER_OP_CUDA_KERNEL( + squared_l2_norm_grad, + ops::SquaredL2NormGradKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op.h b/paddle/fluid/operators/squared_l2_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..56524636b8f1b063266fda0997e91a703131adff --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = sum(square(X)) +template +class SquaredL2NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto out = framework::EigenScalar::From(*Out); + auto *place = + context.template device_context().eigen_device(); + + out.device(*place) = x.square().sum(); + } +}; + +// dX = X +template +class SquaredL2NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *dOut = + context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(dOut->numel() == 1, + "Squared L2 Norm Gradient should be scalar"); + framework::Tensor *dX = + context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto dout = framework::EigenVector::Flatten(*dOut); + auto dx = framework::EigenVector::Flatten(*dX); + auto *place = + context.template device_context().eigen_device(); + + Eigen::DSizes x_dsize(X->numel()); + 
dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..8a99b405e266da48427fa23e9a3e67f2bc54c5a0 --- /dev/null +++ b/paddle/fluid/operators/strided_memcpy.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/detail/strided_memcpy.h" + +namespace paddle { +namespace operators { + +// Strided memory copy from src to dst. +// +// The src and dst should be both on dev_ctx.GetPlace(), otherwise, there will +// be a segment fault. +// +// The stride of an array (also referred to as increment, pitch or step size) is +// the number of locations in memory between beginnings of successive array +// elements +// +// For example, for tensor like [1, 3, 300, 300]. If there is no padding, the +// stride is [270000, 90000, 300, 1]. +// +// NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke +// `dev_ctx.Wait()`. 
+template +inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, + const framework::DDim& src_stride, + const framework::DDim& dst_dim, + const framework::DDim& dst_stride, T* dst) { + using namespace detail; + StridedCopyDimVisitor func(dev_ctx, src, src_stride, dst_stride, dst); + boost::apply_visitor(func, dst_dim); +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a369941a993c02744cb7de0d6c6c878f56d5e0fe --- /dev/null +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/strided_memcpy.h" +#include "gtest/gtest.h" +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +TEST(StridedMemcpy, CPUCrop) { + // clang-format off + int src[] = { + 0, 1, 2, 0, 0, + 0, 3, 4, 0, 0, + 0, 0, 0, 0, 0, + }; + // clang-format on + + framework::DDim src_stride({5, 1}); + + int dst[4]; + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({2, 1}); + + platform::CPUDeviceContext ctx; + StridedMemcpy(ctx, src + 1, src_stride, dst_dim, dst_stride, dst); + + ASSERT_EQ(1, dst[0]); + ASSERT_EQ(2, dst[1]); + ASSERT_EQ(3, dst[2]); + ASSERT_EQ(4, dst[3]); +} + +TEST(StridedMemcpy, CPUConcat) { + // clang-format off + int src[] = { + 1, 2, + 3, 4 + }; + // clang-format on + + int dst[8]; + + framework::DDim src_stride({2, 1}); + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({4, 1}); + platform::CPUDeviceContext ctx; + + StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst); + StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst + 2); + + // clang-format off + int expect_dst[] = { + 1, 2, 1, 2, + 3, 4, 3, 4 + }; + // clang-format on + for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) { + ASSERT_EQ(expect_dst[i], dst[i]); + } +} + +#ifdef PADDLE_WITH_CUDA +TEST(StridedMemcpy, GPUCrop) { + // clang-format off + int src[] = { + 0, 1, 2, 0, 0, + 0, 3, 4, 0, 0, + 0, 0, 0, 0, 0, + }; + // clang-format on + + platform::CUDAPlace gpu0(0); + platform::CPUPlace cpu; + + platform::CUDADeviceContext ctx(gpu0); + + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); + + framework::DDim src_stride({5, 1}); + + int dst[4]; + int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); + + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({2, 1}); + + StridedMemcpy(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, + gpu_dst); + + memory::Copy(cpu, dst, 
gpu0, gpu_dst, sizeof(dst), ctx.stream()); + ctx.Wait(); + + ASSERT_EQ(1, dst[0]); + ASSERT_EQ(2, dst[1]); + ASSERT_EQ(3, dst[2]); + ASSERT_EQ(4, dst[3]); + + memory::Free(gpu0, gpu_dst); + memory::Free(gpu0, gpu_src); +} + +TEST(StridedMemcpy, GPUConcat) { + // clang-format off + int src[] = { + 1, 2, + 3, 4 + }; + // clang-format on + + platform::CUDAPlace gpu0(0); + platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); + + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); + + int dst[8]; + int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); + + framework::DDim src_stride({2, 1}); + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({4, 1}); + + StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); + StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, + gpu_dst + 2); + + memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream()); + ctx.Wait(); + + // clang-format off + int expect_dst[] = { + 1, 2, 1, 2, + 3, 4, 3, 4 + }; + // clang-format on + for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) { + ASSERT_EQ(expect_dst[i], dst[i]); + } + + memory::Free(gpu0, gpu_dst); + memory::Free(gpu0, gpu_src); +} + +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..96f851720aea2ddd643e1e6251fee314e26cbf95 --- /dev/null +++ b/paddle/fluid/operators/sum_op.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sum_op.h" +#include +#include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +class SumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null"); + + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SumOp should not be null."); + if (ctx->IsRuntime() && + ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarDesc::LOD_TENSOR_ARRAY) { + return; // skip runtime infershape when is tensor array; + } + + auto x_dims = ctx->GetInputsDim("X"); + size_t N = x_dims.size(); + PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); + + framework::DDim in_dim({0}); + for (auto& x_dim : x_dims) { + if (framework::product(x_dim) == 0) { + continue; + } + if (framework::product(in_dim) == 0) { + in_dim = x_dim; + } else { + PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape"); + } + } + ctx->SetOutputDim("Out", in_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto x_vars = ctx.MultiInputVar("X"); + if (x_vars[0]->IsType()) { + int dtype = -1; + for (auto& x_var : x_vars) { + auto& lod_tensor = x_var->Get(); + if (lod_tensor.numel() == 0) { + continue; + } + if (dtype == -1) { + 
dtype = framework::ToDataType(lod_tensor.type()); + } else { + PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type())); + } + } + PADDLE_ENFORCE_NE(dtype, -1, + "Sum operator should have at least one tensor"); + + return framework::OpKernelType( + static_cast(dtype), ctx.device_context()); + } else if (x_vars[0]->IsType()) { + return framework::OpKernelType( + framework::ToDataType( + x_vars[0]->Get().value().type()), + ctx.device_context()); + } else if (x_vars[0]->IsType()) { + for (auto& x_var : x_vars) { + auto& array = x_var->Get(); + for (auto& each : array) { + if (each.numel() != 0) { + return framework::OpKernelType(framework::ToDataType(each.type()), + ctx.device_context()); + } + } + } + PADDLE_THROW("Cannot find the input data type by all input data"); + } + PADDLE_THROW("Unexpected branch. Input type is %s", + x_vars[0]->Type().name()); + } +}; + +class SumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(vector) The input tensors of sum operator.") + .AsDuplicable(); + AddOutput("Out", "(Tensor) The output tensor of sum operator."); + AddComment(R"DOC( +Sum operator. + +This operators sums the input tensors. All the inputs can carry the +LoD (Level of Details) information. However, the output only shares +the LoD information with the first input. 
+)DOC"); + } +}; + +class SumOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto var_type = framework::proto::VarDesc::SELECTED_ROWS; + + for (auto& name : op_desc.Input("X")) { + VLOG(10) << name << " " + << block->FindRecursiveOrCreateVar(name).GetType(); + } + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [block](const std::string& name) { + return block->FindRecursiveOrCreateVar(name).GetType() == + framework::proto::VarDesc::LOD_TENSOR; + }); + + auto is_tensor_array = [block](const std::string& name) { + return block->FindRecursiveOrCreateVar(name).GetType() == + framework::proto::VarDesc::LOD_TENSOR_ARRAY; + }; + + bool any_input_is_tensor_array = + std::any_of(inputs.begin(), inputs.end(), is_tensor_array); + bool all_inputs_are_tensor_array = + std::all_of(inputs.begin(), inputs.end(), is_tensor_array); + + if (any_input_is_tensor_array) { + if (!all_inputs_are_tensor_array) { + std::ostringstream os; + for (auto& each : inputs) { + os << " " << each << " type is " + << block->FindRecursiveOrCreateVar(each).GetType() << "\n"; + } + PADDLE_ENFORCE(all_inputs_are_tensor_array, + "Not all inputs are tensor array:\n%s", os.str()); + } + var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY; + } else if (any_input_is_lod_tensor) { + var_type = framework::proto::VarDesc::LOD_TENSOR; + } + + auto out_var_name = op_desc.Output("Out").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + out_var.SetType(var_type); + auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); + out_var.SetDataType(in_var.GetDataType()); + } +}; + +class SumGradMaker : public framework::GradOpDescMakerBase { + public: + using framework::GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + auto x_grads = InputGrad("X", false); 
+ std::vector> grad_ops; + grad_ops.reserve(x_grads.size()); + auto og = OutputGrad("Out"); + std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops), + [&og](const std::string& x_grad) { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", og); + grad_op->SetOutput("Out", {x_grad}); + grad_op->SetAttr("scale", 1.0f); + return std::unique_ptr(grad_op); + }); + return grad_ops; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, + ops::SumOpVarTypeInference); +REGISTER_OP_CPU_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8d8f90d7510bae854a0507adaf8998fb7aea3b58 --- /dev/null +++ b/paddle/fluid/operators/sum_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sum_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5e1222c6ef723a6321392a5af7fdb558c24df32b --- /dev/null +++ b/paddle/fluid/operators/sum_op.h @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; +using LoDTensor = framework::LoDTensor; +template +using EigenVector = framework::EigenVector; + +template +class SumKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto in_vars = context.MultiInputVar("X"); + int N = in_vars.size(); + auto out_var = context.OutputVar("Out"); + + bool in_place = out_var == in_vars[0]; + + if (out_var->IsType()) { + auto *out = context.Output("Out"); + if (!in_place) { + out->mutable_data(context.GetPlace()); + } + auto result = EigenVector::Flatten(*out); + if (!in_place) { + math::SetConstant constant_functor; + constant_functor(context.template device_context(), out, + 0.0); + } + + math::SelectedRowsAddToTensor functor; + auto &place = + *context.template device_context().eigen_device(); + // If in_place, just skip the first tensor + for (int i = in_place ? 
1 : 0; i < N; i++) { + if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); + if (in_t.numel() == 0) { + continue; + } + auto in = EigenVector::Flatten(in_t); + result.device(place) = result + in; + } else if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); + functor(context.template device_context(), in_t, out); + } else { + PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + } + } + } else if (out_var->IsType()) { + std::unique_ptr in0; + if (in_place) { + // If is in_place, we store the input[0] to in0 + auto &in_sel0 = in_vars[0]->Get(); + auto &rows = in_sel0.rows(); +#ifdef PADDLE_WITH_CUDA + std::vector rows_in_cpu; + rows_in_cpu.reserve(rows.size()); + for (auto item : rows) { + rows_in_cpu.push_back(item); + } + in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height())); +#else + in0.reset(new framework::SelectedRows(rows, in_sel0.height())); +#endif + in0->mutable_value()->ShareDataWith(in_sel0.value()); + } + + auto get_selected_row = [&](size_t i) -> const SelectedRows & { + if (i == 0 && in0) { + return *in0.get(); + } else { + return in_vars[i]->Get(); + } + }; + + auto *out = context.Output("Out"); + out->mutable_rows()->clear(); + auto *out_value = out->mutable_value(); + + // Runtime InferShape + size_t first_dim = 0; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + first_dim += sel_row.rows().size(); + } + auto in_dim = + framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim[0] = static_cast(first_dim); + + out_value->Resize(framework::make_ddim(in_dim)); + out_value->mutable_data(context.GetPlace()); + + math::SelectedRowsAddTo functor; + + int64_t offset = 0; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + + PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); + functor(context.template device_context(), sel_row, + offset, out); + offset += sel_row.value().numel(); + } + } else if (out_var->IsType()) { + auto &out_array = 
*out_var->GetMutable(); + for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) { + PADDLE_ENFORCE(in_vars[i]->IsType(), + "Only support all inputs are TensorArray"); + auto &in_array = in_vars[i]->Get(); + + for (size_t i = 0; i < in_array.size(); ++i) { + if (in_array[i].numel() != 0) { + if (i >= out_array.size()) { + out_array.resize(i + 1); + } + if (out_array[i].numel() == 0) { + framework::Copy(in_array[i], in_array[i].place(), + context.device_context(), &out_array[i]); + out_array[i].set_lod(in_array[i].lod()); + } else { + PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); + auto in = EigenVector::Flatten(in_array[i]); + auto result = EigenVector::Flatten(out_array[i]); + result.device(*context.template device_context() + .eigen_device()) = result + in; + } + } + } + } + } else { + PADDLE_THROW("Unexpected branch, output variable type is %s", + out_var->Type().name()); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/target_assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f1b7252318f8321ef394d321aedb658398e16d --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/target_assign_op.h" + +namespace paddle { +namespace operators { + +class TargetAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // checkout inputs + PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"), + "Input(EncodedGTBBox) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"), + "Input(GTScoreLabel) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("NegIndices"), + "Input(NegIndices) of TargetAssignOp should not be null"); + + // checkout outputs + PADDLE_ENFORCE( + ctx->HasOutput("PredBBoxLabel"), + "Output(PredBBoxLabel) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredBBoxWeight"), + "Output(PredBBoxWeight) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredScoreLabel"), + "Output(PredScoreLabel) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredScoreWeight"), + "Output(PredScoreWeight) of TargetAssignOp should not be null."); + + auto blabel_dims = ctx->GetInputDim("EncodedGTBBox"); + auto slabel_dims = ctx->GetInputDim("GTScoreLabel"); + auto mi_dims = ctx->GetInputDim("MatchIndices"); + auto neg_dims = ctx->GetInputDim("NegIndices"); + + PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL, + "The rank of Input(EncodedGTBBox) must be 3."); + PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL, + "The rank of Input(GTScoreLabel) must be 2."); + PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL, + "The rank of Input(MatchIndices) must be 2."); + PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL, + "The rank of Input(NegIndices) must be 2."); + + PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0], + "The 1st dimension (means the total number of " + "ground-truth bounding 
boxes) of Input(EncodedGTBBox) " + "and Input(GTScoreLabel) must be the same."); + PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1], + "The 2nd dimension (means the number of priod boxes) " + "of Input(EncodedGTBBox) and " + "Input(MatchIndices) must be the same."); + PADDLE_ENFORCE_EQ(blabel_dims[2], 4, + "The 3rd dimension of Input(EncodedGTBBox) must be 4."); + + auto n = mi_dims[0]; + auto np = mi_dims[1]; + ctx->SetOutputDim("PredBBoxLabel", {n, np, 4}); + ctx->SetOutputDim("PredBBoxWeight", {n, np, 1}); + ctx->SetOutputDim("PredScoreLabel", {n, np, 1}); + ctx->SetOutputDim("PredScoreWeight", {n, np, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("EncodedGTBBox")->type()), + ctx.device_context()); + } +}; + +class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("EncodedGTBBox", + "(LoDTensor), The encoded ground-truth bounding boxes with shape " + "[Ng, Np, 4], where Ng is the total number of ground-truth boxes " + "in this mini-batch, Np the number of predictions, 4 is the " + "number of coordinate in [xmin, ymin, xmax, ymax] layout."); + AddInput("GTScoreLabel", + "(LoDTensor, default LoDTensor), The input ground-truth " + "labels with shape [Ng, 1], where the Ng is the same as it in " + "the input of EncodedGTBBox."); + AddInput("MatchIndices", + "(Tensor, default Tensor), The input matched indices " + "with shape [N, Np], where N is the batch size, Np is the same " + "as it in the input of EncodedGTBBox. 
If MatchIndices[i][j] " + "is -1, the j-th prior box is not matched to any ground-truh " + "box in i-th instance."); + AddInput("NegIndices", + "(LoDTensor, default LoDTensor), The input negative example " + "indices with shape [Neg, 1], where is the total number of " + "negative example indices."); + AddAttr("background_label", + "(int, default 0), Label index of background class.") + .SetDefault(0); + AddOutput("PredBBoxLabel", + "(Tensor), The output encoded ground-truth labels " + "with shape [N, Np, 4], N is the batch size and Np, 4 is the " + "same as they in input of EncodedGTBBox. If MatchIndices[i][j] " + "is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth " + "box for background_label in i-th instance."); + AddOutput("PredBBoxWeight", + "(Tensor), The weight for PredBBoxLabel with the shape " + "of [N, Np, 1]"); + AddOutput("PredScoreLabel", + "(Tensor, default Tensor), The output score labels for " + "each predictions with shape [N, Np, 1]. If MatchIndices[i][j] " + "is -1, PredScoreLabel[i][j] = background_label."); + AddOutput("PredScoreWeight", + "(Tensor), The weight for PredScoreLabel with the shape " + "of [N, Np, 1]"); + AddComment(R"DOC( +This operator is, for given the encoded boxes between prior boxes and +ground-truth boxes and ground-truth class labels, to assign classification +and regression targets to each prior box as well as weights to each +prior box. The weights is used to specify which prior box would not contribute +to training loss. + +For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`, +`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`. +Assumed that the row offset for each instance in `EncodedGTBBox` is called lod, +this operato assigns classification/regression targets by performing the +following steps: + +1. Assigning all outpts based on `MatchIndices`: + +If id = MatchIndices[i][j] > 0, + + PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j] + PredBBoxWeight[i][j] = 1. 
+ PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id] + PredScoreWeight[i][j] = 1. + +Otherwise, + + PredBBoxLabel[j][j] = [0., 0., 0., 0.] + PredBBoxWeight[i][j] = 0. + PredScoreLabel[i][j] = background_label + PredScoreWeight[i][j] = 0. + +2. Assigning PredScoreWeight based on `NegIndices`: + +Assumed that the row offset for each instance in `NegIndices` is caleed neg_lod, +for i-th instance and all ids of NegIndices in this instance: + + PredScoreLabel[i][id] = background_label + PredScoreWeight[i][id] = 1.0 + + )DOC"); + } +}; + +template +struct NegTargetAssignFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices, + const size_t* lod, const int num, const int num_prior_box, + const int background_label, int* out_label, T* out_label_wt) { + for (int i = 0; i < num; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { + int id = neg_indices[j]; + out_label[i * num_prior_box + id] = background_label; + out_label_wt[i * num_prior_box + id] = static_cast(1.0); + } + } + } +}; + +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp, + ops::TargetAssignOpMaker); +REGISTER_OP_CPU_KERNEL( + target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/target_assign_op.cu b/paddle/fluid/operators/target_assign_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5c012d27ad82eb62d9981c8c73ef5b8cc03adc47 --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.cu @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/target_assign_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod, + const int num, const int num_prior_box, + const int background_label, + int* out_label, T* out_label_wt) { + int bidx = blockIdx.x; + int st = lod[bidx]; + int ed = lod[bidx + 1]; + + int row_start = bidx * num_prior_box; + for (int i = st + threadIdx.x; i < ed; i += blockDim.x) { + int id = row_start + neg_indices[i]; + out_label[id] = background_label; + out_label_wt[id] = 1.; + } +} + +template +struct NegTargetAssignFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const int* neg_indices, const size_t* lod, const int num, + const int num_prior_box, const int background_label, + int* out_label, T* out_label_wt) { + const int block_size = 256; + const int grid_size = num; + NegTargetAssignKernel<<>>( + neg_indices, lod, num, num_prior_box, background_label, out_label, + out_label_wt); + } +}; + +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/target_assign_op.h b/paddle/fluid/operators/target_assign_op.h new file mode 100644 index 0000000000000000000000000000000000000000..876111523af51ce67e804d6646f404c45c00af12 --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.h @@ -0,0 +1,160 @@ +/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct TargetAssignFunctor { + const T* gt_box_; + const int* gt_label_; + const int* match_indices_; + const size_t* lod_; + const int background_label_; + const int64_t num_; + const int64_t num_prior_box_; + + T* out_box_; + T* out_box_wt_; + int* out_label_; + T* out_label_wt_; + + TargetAssignFunctor(const T* gt_box, const int* gt_label, + const int* match_indices, const size_t* lod, + const int background_label, const int64_t num, + const int64_t np, T* out_box, T* out_box_wt, + int* out_label, T* out_label_wt) + : gt_box_(gt_box), + gt_label_(gt_label), + match_indices_(match_indices), + lod_(lod), + background_label_(background_label), + num_(num), + num_prior_box_(np), + out_box_(out_box), + out_box_wt_(out_box_wt), + out_label_(out_label), + out_label_wt_(out_label_wt) {} + + HOSTDEVICE void operator()(size_t i) const { + int row = i / num_prior_box_; + int col = i - row * num_prior_box_; + + size_t row_off = lod_[row]; + int offset = row * num_prior_box_ + col; + + int id = match_indices_[offset]; + T* obox = out_box_ + offset * 4; + int* olabel = out_label_ + offset; + T* obox_wt = out_box_wt_ + offset; + T* olabel_wt = out_label_wt_ + offset; + + if (id > 
-1) { + const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4; + + obox[0] = gtbox[0]; + obox[1] = gtbox[1]; + obox[2] = gtbox[2]; + obox[3] = gtbox[3]; + + olabel[0] = gt_label_[row_off + id]; + obox_wt[0] = static_cast(1.); + olabel_wt[0] = static_cast(1.); + } else { + obox[0] = static_cast(0.); + obox[1] = static_cast(0.); + obox[2] = static_cast(0.); + obox[3] = static_cast(0.); + + olabel[0] = background_label_; + obox_wt[0] = static_cast(0.); + olabel_wt[0] = static_cast(0.); + } + } +}; + +template +struct NegTargetAssignFunctor { + void operator()(const platform::DeviceContext& ctx, const int* neg_indices, + const size_t* lod, const int num, const int num_prior_box, + const int background_label, int* out_label, + T* out_label_wt) const; +}; + +template +class TargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* enc_gt_box = ctx.Input("EncodedGTBBox"); + auto* gt_label = ctx.Input("GTScoreLabel"); + auto* match_indices = ctx.Input("MatchIndices"); + auto* neg_indices = ctx.Input("NegIndices"); + + auto* out_box = ctx.Output("PredBBoxLabel"); + auto* out_box_wt = ctx.Output("PredBBoxWeight"); + auto* out_label = ctx.Output("PredScoreLabel"); + auto* out_label_wt = ctx.Output("PredScoreWeight"); + + PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL); + PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL); + PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL); + + int background_label = ctx.Attr("background_label"); + + const T* box_data = enc_gt_box->data(); + const int* label_data = gt_label->data(); + const int* match_idx_data = match_indices->data(); + const int* neg_idx_data = neg_indices->data(); + + T* obox_data = out_box->mutable_data(ctx.GetPlace()); + T* obox_wt_data = out_box_wt->mutable_data(ctx.GetPlace()); + int* olabel_data = out_label->mutable_data(ctx.GetPlace()); + T* olabel_wt_data = out_label_wt->mutable_data(ctx.GetPlace()); + + int64_t num = 
match_indices->dims()[0]; + int64_t num_prior_box = match_indices->dims()[1]; + + auto gt_lod = enc_gt_box->lod().back(); + auto gt_label_lod = gt_label->lod().back(); + auto neg_lod = neg_indices->lod().back(); + for (size_t i = 0; i < gt_lod.size(); ++i) { + PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]); + } + + size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace()); + size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); + + TargetAssignFunctor functor(box_data, label_data, match_idx_data, + gt_lod_data, background_label, num, + num_prior_box, obox_data, obox_wt_data, + olabel_data, olabel_wt_data); + + auto& device_ctx = ctx.template device_context(); + platform::ForRange for_range(device_ctx, + num * num_prior_box); + for_range(functor); + + NegTargetAssignFunctor neg_trg_functor; + neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box, + background_label, olabel_data, olabel_wt_data); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..50811fb22491598849216f41a584ae0b68f8f306 --- /dev/null +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -0,0 +1,220 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/array_operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +namespace paddle { +namespace operators { + +class WriteToArrayOp : public ArrayOp { + public: + WriteToArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + if (x == nullptr) return; + auto &x_tensor = x->Get(); + size_t offset = GetOffset(scope, place); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + if (offset >= out->size()) { + VLOG(10) << "Resize " << Output("Out") << " from " << out->size() + << " to " << offset + 1; + out->resize(offset + 1); + } + if (x_tensor.memory_size() > 0) { + auto *out_tensor = &out->at(offset); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + Copy(x_tensor, place, dev_ctx, out_tensor); + out_tensor->set_lod(x_tensor.lod()); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; + } + } +}; + +class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) the tensor will be written to tensor array"); + AddInput( + "I", + "(Tensor) the subscript index in tensor array. The number of element " + "should be 1"); + AddOutput("Out", "(TensorArray) the tensor array will be written"); + AddComment(R"DOC( +WriteToArray Operator. + +This operator writes a LoDTensor to a LoDTensor array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. 
The +equation is + +$$A[i] = T$$ + +)DOC"); + } +}; + +class WriteToArrayInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index"); + PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, + "The number of element of subscript index must be 1"); + if (!context->HasInput("X")) { + return; + } + PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError()); + context->SetOutputDim("Out", context->GetInputDim("X")); + } + + protected: + virtual const char *NotHasXError() const { return "Must set the lod tensor"; } + + virtual const char *NotHasOutError() const { + return "Must set the lod tensor array"; + } +}; + +class WriteToArrayInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto x_name = op_desc.Input("X")[0]; + auto out_name = op_desc.Output("Out")[0]; + VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; + auto &out = block->FindRecursiveOrCreateVar(out_name); + out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); + auto *x = block->FindVarRecursive(x_name); + if (x != nullptr) { + out.SetDataType(x->GetDataType()); + } + } +}; + +class ReadFromArrayOp : public ArrayOp { + public: + ReadFromArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x != nullptr, "X must be set"); + auto &x_array = x->Get(); + auto *out = scope.FindVar(Output("Out")); + PADDLE_ENFORCE(out != nullptr, "Out must be set"); + size_t offset = GetOffset(scope, place); + if (offset < x_array.size()) { + auto 
*out_tensor = out->GetMutable(); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(x_array[offset], place, dev_ctx, out_tensor); + out_tensor->set_lod(x_array[offset].lod()); + } else { + VLOG(10) << "offset " << offset << " >= " << x_array.size(); + } + } +}; + +class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(TensorArray) the array will be read from."); + AddInput("I", + "(Tensor) the subscript index in tensor array. The number of " + "element should be 1"); + AddOutput("Out", "(LoDTensor) the tensor will be read from."); + AddComment(R"DOC( +ReadFromArray Operator. + +Read a LoDTensor from a LoDTensor Array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The +equation is + +$$T = A[i]$$ + +)DOC"); + } +}; + +class ReadFromArrayInferShape : public WriteToArrayInferShape { + protected: + const char *NotHasXError() const override { + return "The input array X must be set"; + } + const char *NotHasOutError() const override { + return "The output tensor out must be set"; + } +}; + +class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("read_from_array"); + grad_op->SetInput("I", Input("I")); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto 
*grad_op = new framework::OpDesc(); + grad_op->SetType("write_to_array"); + grad_op->SetInput("I", Input("I")); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp, + ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker, + ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType); +REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp, + ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker, + ops::ReadFromArrayGradMaker); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c81ea860d0c9fa5498de2d149e3d05d080ad729f --- /dev/null +++ b/paddle/fluid/operators/top_k_op.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/top_k_op.h" + +namespace paddle { +namespace operators { + +class TopkOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of TopkOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + const int k = static_cast(ctx->Attrs().Get("k")); + + PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); + PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape"); + PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, + "input must have >= k columns"); + + framework::DDim dims = input_dims; + dims[dims.size() - 1] = k; + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } +}; + +class TopkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of Topk op"); + AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); + AddComment(R"DOC( +Top K operator + +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. +Thus values[j] is the j-th largest entry in input, and its index is indices[j]. + +For matrices, this operator computes the top k entries in each row. 
)DOC"); + AddAttr("k", + "(int, default 1) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .SetDefault(1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(top_k, + ops::TopkKernel); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5390cb5063bcc302f5ff9cfe96bd421b477eeb3f --- /dev/null +++ b/paddle/fluid/operators/top_k_op.cu @@ -0,0 +1,320 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Pair { + __device__ __forceinline__ Pair() {} + __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} + + __device__ __forceinline__ void set(T value, int64_t id) { + v = value; + id = id; + } + + __device__ __forceinline__ void operator=(const Pair& in) { + v = in.v; + id = in.id; + } + + __device__ __forceinline__ bool operator<(const T value) const { + return (v < value); + } + + __device__ __forceinline__ bool operator<(const Pair& in) const { + return (v < in.v) || ((v == in.v) && (id > in.id)); + } + + __device__ __forceinline__ bool operator>(const Pair& in) const { + return (v > in.v) || ((v == in.v) && (id < in.id)); + } + + T v; + int64_t id; +}; + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, + int beam_size) { + for (int k = beam_size - 2; k >= 0; k--) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p) { + for (int k = beam_size - 2; k >= 0; k--) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + AddTo(topk, tmp, beam_size); + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, const Pair& max, + int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + if (tmp < max) { + AddTo(topk, tmp, beam_size); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ 
void GetTopK(Pair topk[], const T* val, int* col, + int idx, int dim, int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + AddTo(topk, tmp, beam_size); + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, + int idx, int dim, const Pair& max, + int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + if (tmp < max) { + AddTo(topk, tmp, beam_size); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, + int beam_size, const T* src, + bool& firstStep, bool& is_empty, + Pair& max, int dim, + const int tid) { + if (beam > 0) { + int length = beam < beam_size ? beam : beam_size; + if (firstStep) { + firstStep = false; + GetTopK(topk, src, tid, dim, length); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - beam) { + topk[k] = topk[k + beam]; + } else { + topk[k].set(-INFINITY, -1); + } + } + if (!is_empty) { + GetTopK(topk + MaxLength - beam, src, tid, dim, max, + length); + } + } + + max = topk[MaxLength - 1]; + if (max.v == -1) is_empty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, + int beam_size, const T* val, + int* col, bool& firstStep, + bool& is_empty, Pair& max, + int dim, const int tid) { + if (beam > 0) { + int length = beam < beam_size ? 
beam : beam_size; + if (firstStep) { + firstStep = false; + GetTopK(topk, val, col, tid, dim, length); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - beam) { + topk[k] = topk[k + beam]; + } else { + topk[k].set(-INFINITY, -1); + } + } + if (!is_empty) { + GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + length); + } + } + + max = topk[MaxLength - 1]; + if (max.v == -1) is_empty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, + Pair topk[], T** topVal, + int64_t** topIds, int& beam, int& k, + const int tid, const int warp) { + while (true) { + __syncthreads(); + if (tid < BlockSize / 2) { + if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { + maxid[tid] = tid + BlockSize / 2; + } else { + maxid[tid] = tid; + } + } + __syncthreads(); + for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { + if (tid < stride) { + if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { + maxid[tid] = maxid[tid + stride]; + } + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + **topVal = sh_topk[maxid[0]].v; + **topIds = sh_topk[maxid[0]].id; + (*topVal)++; + (*topIds)++; + } + if (tid == maxid[0]) beam++; + if (--k == 0) break; + __syncthreads(); + + if (tid == maxid[0]) { + if (beam < MaxLength) { + sh_topk[tid] = topk[beam]; + } + } + if (maxid[0] / 32 == warp) { + if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + } + } +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top MaxLength value; + * 2. merge to sh_topk, block reduce and get max value; + * 3. go to the second setp, until one thread's topk value is null; + * 4. go to the first setp, until get the topk value. 
+ */ +template +__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, + const T* src, int lds, int dim, int k) { + __shared__ Pair sh_topk[BlockSize]; + __shared__ int maxid[BlockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + output += blockIdx.x * output_stride; + indices += blockIdx.x * k; + + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int k = 0; k < MaxLength; k++) { + topk[k].set(-INFINITY, -1); + } + while (k) { + ThreadGetTopK(topk, beam, k, + src + blockIdx.x * lds, firststep, + is_empty, max, dim, tid); + + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &output, + &indices, beam, k, tid, warp); + } +} + +template +class TopkOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + size_t k = static_cast(ctx.Attr("k")); + + const T* input_data = input->data(); + + T* output_data = output->mutable_data(ctx.GetPlace()); + // FIXME(typhoonzero): data is always converted to type T? + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + size_t input_height = input->dims()[0]; + size_t input_width = input->dims()[1]; + if (k > input_width) k = input_width; + + // NOTE: pass lds and dim same to input width. + // NOTE: old matrix implementation of stride is different to eigen. + // TODO(typhoonzero): refine this kernel. 
+ dim3 threads(256, 1); + dim3 grid(input_height, 1); + + KeMatrixTopK<<< + grid, threads, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(output_data, output->dims()[1], + indices_data, input_data, + input_width, input_width, int(k)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e32b35150070b30c3ccbbb9483c1f24b1d205919 --- /dev/null +++ b/paddle/fluid/operators/top_k_op.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +class TopkKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Get the top k elements of each row of input tensor + // FIXME: only deal with matrix(2d tensor). 
+ auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + // k is determined by Attr + const size_t k = static_cast(ctx.Attr("k")); + + T* output_data = output->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + auto eg_input = EigenMatrix::From(*input); + + // reshape input to a flattern matrix(like flat_inner_dims) + framework::DDim inputdims = input->dims(); + const size_t row = framework::product( + framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); + const size_t col = inputdims[inputdims.size() - 1]; + Eigen::DSizes flat2dims(row, col); + // NOTE: eigen shape doesn't affect paddle tensor. + eg_input.reshape(flat2dims); + + for (size_t i = 0; i < row; i++) { + std::vector> vec; + for (size_t j = 0; j < col; j++) { + vec.push_back(std::pair(eg_input(i, j), j)); + } + + std::partial_sort( + vec.begin(), vec.begin() + k, vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < k; j++) { + output_data[i * k + j] = vec[j].first; + indices_data[i * k + j] = int64_t(vec[j].second); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3d8acffc269c1abe98d2de39dcf09fbf3d825f3 --- /dev/null +++ b/paddle/fluid/operators/transpose_op.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class TransposeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + std::vector axis = ctx->Attrs().Get>("axis"); + size_t x_rank = x_dims.size(); + size_t axis_size = axis.size(); + + PADDLE_ENFORCE_EQ(x_rank, axis_size, + "The input tensor's rank(%d) " + "should be equal to the axis's size(%d)", + x_rank, axis_size); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + "Each element of Attribute axis should be a unique value " + "range from 0 to (dims - 1), " + "where the dims is the axis's size"); + } + + framework::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = x_dims[axis[i]]; + } + ctx->SetOutputDim("Out", out_dims); + } +}; + +class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor, tensors with rank up to 6 are supported."); + AddOutput("Out", "(Tensor)The output tensor."); + AddAttr>( + "axis", + "(vector) 
A list of values, and the size of the list should be " + "the same with the input tensor rank. This operator permutes the input " + "tensor's axes according to the values given."); + AddComment(R"DOC( +Transpose Operator. + +The input tensor will be permuted according to the axes given. +The behavior of this operator is similar to how `numpy.transpose` works. + +- suppose the input `X` is a 2-D tensor: + $$ + X = \begin{pmatrix} + 0 &1 &2 \\ + 3 &4 &5 + \end{pmatrix}$$ + + the given `axes` is: $[1, 0]$, and $Y$ = transpose($X$, axis) + + then the output $Y$ is: + + $$ + Y = \begin{pmatrix} + 0 &3 \\ + 1 &4 \\ + 2 &5 + \end{pmatrix}$$ + +- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is +$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$. + +)DOC"); + } +}; + +class TransposeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, + ops::TransposeOpGrad); +REGISTER_OP_CPU_KERNEL( + transpose, ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8667ab369e1de9c2b74ba902242355d1660d24a --- /dev/null +++ 
b/paddle/fluid/operators/transpose_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/transpose_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + transpose, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( + transpose_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1fb419474ab078efa64523454bbbbb6176a58d40 --- /dev/null +++ b/paddle/fluid/operators/transpose_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +inline void TransCompute(const int dim, const DeviceContext& dev_ctx, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + switch (dim) { + case 1: + math::Transpose trans1; + trans1(dev_ctx, in, out, axis); + break; + case 2: + math::Transpose trans2; + trans2(dev_ctx, in, out, axis); + break; + case 3: + math::Transpose trans3; + trans3(dev_ctx, in, out, axis); + break; + case 4: + math::Transpose trans4; + trans4(dev_ctx, in, out, axis); + break; + case 5: + math::Transpose trans5; + trans5(dev_ctx, in, out, axis); + break; + case 6: + math::Transpose trans6; + trans6(dev_ctx, in, out, axis); + break; + default: + PADDLE_THROW("Tensors with rank at most 6 are supported"); + } +} + +template +class TransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + std::vector axis = context.Attr>("axis"); + int ndims = axis.size(); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); + } +}; + +template +class TransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* x_grad = + context.Output(framework::GradVarName("X")); + if (!x_grad) return; + + x_grad->mutable_data(context.GetPlace()); + std::vector axis = context.Attr>("axis"); + std::vector reversed_axis(axis); + + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + int ndims = axis.size(); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, 
+ reversed_axis); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6fea1d4485fd7f88375c96511c85396b707bf1c --- /dev/null +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
+template +class CPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* tensor = ctx.Output("Out"); + T* data = tensor->mutable_data(ctx.GetPlace()); + unsigned int seed = static_cast(ctx.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist( + static_cast(ctx.Attr("min")), + static_cast(ctx.Attr("max"))); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } +}; + +class UniformRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UniformRandomOp should not be null."); + + PADDLE_ENFORCE( + ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), + "uniform_random's min must less then max"); + auto& shape = ctx->Attrs().Get>("shape"); + std::vector temp; + temp.reserve(shape.size()); + for (auto dim : shape) { + temp.push_back(static_cast(dim)); + } + ctx->SetOutputDim("Out", framework::make_ddim(temp)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.GetPlace()); + } +}; + +class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) The output tensor of uniform random op"); + AddComment(R"DOC( +Uniform random operator. + +This operator initializes a tensor with random values sampled from a +uniform distribution. 
+ +)DOC"); + AddAttr>("shape", + "(vector) The shape of the output tensor"); + AddAttr("min", + "(float, default -1.0) " + "Minimum value of uniform random") + .SetDefault(-1.0f); + AddAttr("max", + "(float, default 1.0) " + "Maximun value of uniform random") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed used for generating samples. " + "0 means use a seed generated by the system.") + .SetDefault(0); + AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") + .SetDefault(framework::proto::DataType::FP32); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp, + paddle::operators::UniformRandomOpMaker); +REGISTER_OP_CPU_KERNEL(uniform_random, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9afca68e59f8c0eedaf38be4b51343b9cb043f65 --- /dev/null +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + + __host__ __device__ UniformGenerator(T min, T max, int seed) + : min_(min), max_(max), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + return dist(rng); + } +}; + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. +template +class GPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T min = static_cast(context.Attr("min")); + T max = static_cast(context.Attr("max")); + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGenerator(min, max, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e0b271fed69772a53a776f290d524565df3dc94 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unpool_op.h" +namespace paddle { +namespace operators { + +class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of unpool operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput( + "Indices", + "(Tensor) The input tensor of the indices given out by MaxPool2d. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." 
+ "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr>( + "ksize", + "(vector), the unpooling window size(height, width) " + "of unpooling operator."); + AddAttr>("strides", + "(vector, default:{1, 1}), " + "strides (height, width) of unpooling operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector defalut:{0,0}), " + "paddings (height, width) of unpooling operator.") + .SetDefault({0, 0}); + AddAttr( + "unpooling_type", + "(string), unpooling type, can be \"max\" for max-unpooling ") + .InEnum({"max"}); + AddComment(R"DOC( +Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: +$(N, C_{out}, H_{out}, W_{out})$, where +$$ +H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +$$ +Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf +)DOC"); + } +}; + +int OutputSize(int input_size, int ksize, int padding, int stride) { + int output_size = (input_size - 1) * stride - 2 * padding + ksize; + return output_size; +} + +class UnpoolOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnpoolOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); + 
std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling intput must be of 4-dimensional."); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class UnpoolOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, + ops::UnpoolOpGrad); +REGISTER_OP_CPU_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..15d81eb296ba35d9f67426083870c1a83ff66ee5 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unpool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ceed5507391b40491d4a26bbc51e8c861e1bf1c2 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/unpooling.h" + +namespace paddle { +namespace operators { +template +class UnpoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + auto* out = context.Output("Out"); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (output_data) { + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(dev_ctx, *in_x, *in_y, out); + } +}; +template +class UnpoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + } + math::Unpool2dMaxGradFunctor 
unpool2d_max_backward; + unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c05fed0b47c3bb3582e4b261ef188146d41820e --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/warpctc_op.h" + +namespace paddle { +namespace operators { + +class WarpCTCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"), + "Output(WarpCTCGrad) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of WarpCTCOp should not be null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + int sequence_width = + static_cast(framework::product(logits_dims) / logits_dims[0]); + int blank = ctx->Attrs().Get("blank"); + PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width), + "The value of Attr(blank) should be in interval [0, %d).", + sequence_width); + // TODO(liuyiqun): it is tricky to set the wrong dimension here. + ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "(LodTensor, default: LoDTensor), the unscaled " + "probabilities of variable-length sequences, which is a 2-D " + "Tensor with LoD information. 
It's shape is " + "[Lp, num_classes + 1], where Lp is the sum of all input " + "sequences' length and num_classes is the true number of classes " + "(not including the blank label)."); + AddInput("Label", + "(LodTensor, default: LoDTensor), the ground truth " + "of variable-length sequence, which is a 2-D Tensor with LoD " + "information. It is of the shape [Lg, 1], where Lg is th sum of " + "all labels' length."); + AddOutput("WarpCTCGrad", + "(Tensor, default: Tensor), a temporary " + "output Tensor to store the gradients of warp-ctc, which is " + "computed with loss together in one call. It is a 3-D Tensor of " + "the shape [max_sequence_length, batch_size, num_classes + 1].") + .AsIntermediate(); + AddOutput("Loss", + "(Tensor, default: Tensor), the Connectionist " + "Temporal Classification (CTC) loss, which is a 2-D Tensor of " + "the shape [batch_size, 1]"); + AddAttr("blank", + "(int, default: 0), the blank label of Connectionist " + "Temporal Classification (CTC) loss, which is in the " + "half-opened interval [0, num_classes + 1).") + .SetDefault(0); + AddAttr("norm_by_times", + "(bool, default: false), whether to " + "normalize the gradients by the number of time-step, " + "which is also the sequence's length.") + .SetDefault(false); + AddComment(R"DOC( +An operator integrating the open-source +[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in +[Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin]( +https://arxiv.org/pdf/1512.02595v1.pdf), +to compute Connectionist Temporal Classification (CTC) loss. +It can be aliased as softmax with ctc, since a native softmax activation is +interated to the warp-ctc library, to to normlize values for each row of the +input tensor. 
+ +More detail of CTC loss can be found by refering to +[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with +Recurrent Neural Networks]( +http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf). +)DOC"); + } +}; + +class WarpCTCGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"), + "Input(WarpCTCGrad) of WarpCTCGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@GRAD) of WarpCTCGradOp should not be null."); + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Logits")); + ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad, + ops::WarpCTCGradOp); +REGISTER_OP_CPU_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CPU_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/fluid/operators/warpctc_op.cu.cc b/paddle/fluid/operators/warpctc_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ee7f970a9a5f7deaab8a98278cf1a7c051cfbd2 --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/warpctc_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CUDA_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a1de71627ee7e511a03231c11831cf54755a145e --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.h @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" +#include "paddle/fluid/operators/math/sequence_scale.h" +#include "paddle/fluid/platform/dynload/warpctc.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class WarpCTCFunctor { + public: + /* + * \brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * If gradient is nullptr, it only computes the ctc loss, + * or computes both ctc loss and gradient. + * + * \param ctx execution context of this functor + * \param input batch matrix of input probabilities, in + * max_sequence_length x num_sequences x + * sequence_width, (row-major) format + * \param gradient batch matrix of gradient, with the same shape as + * input. + * \param cpu_labels labels always in CPU memory. + * \param cpu_label_lengths length of all labels in CPU memory. + * \param cpu_input_lengths length of all sequences in CPU memory. + * \param sequence_width number of possible output symbols. + * \param num_sequences number of sequence. + * \param blank blank label used in ctc loss function. + * \param cpu_losss cost of each sequence in CPU memory. + */ + void operator()(const framework::ExecutionContext& ctx, const float* input, + float* gradient, const int* cpu_labels, + const int* cpu_label_lengths, const int* cpu_input_lengths, + const size_t sequence_width, const size_t num_sequences, + const size_t blank, float* cpu_loss) { + // Init warp-ctc options + init(ctx, blank); + + // Compute the required workspace size. + // There is no memory allocated operations within warp-ctc. 
+ size_t workspace_bytes = 0; + ctcStatus_t status = platform::dynload::get_workspace_size( + cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), + static_cast(num_sequences), options_, &workspace_bytes); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in get_workspace_size: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + PADDLE_ENFORCE_GT(workspace_bytes, 0UL, + "Bytes of workspace got by warp-ctc function, " + "get_workspace_size(), should be larger than 0."); + + Tensor workspace; + size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL; + float* workspace_data = workspace.mutable_data( + framework::make_ddim({static_cast(workspace_elements)}), + ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), &workspace, + static_cast(0)); + + // compute loss and gradient + status = platform::dynload::compute_ctc_loss( + input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths, + static_cast(sequence_width), static_cast(num_sequences), + cpu_loss, workspace_data, options_); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in compute_ctc_loss: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + } + + protected: + void init(const framework::ExecutionContext& ctx, const size_t blank) { + warpctc_version_ = platform::dynload::get_warpctc_version(); + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + options_.loc = CTC_GPU; + options_.stream = reinterpret_cast( + ctx.device_context()) + .stream(); +#else + PADDLE_THROW("[warpctc init] GPU is not enabled."); +#endif + } else { + options_.loc = CTC_CPU; + options_.num_threads = 1; + } + + options_.blank_label = blank; + } + + private: + int warpctc_version_; + ctcOptions options_; +}; + +template +class WarpCTCKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* logits 
= ctx.Input("Logits"); + auto* label = ctx.Input("Label"); + auto* warpctc_grad = ctx.Output("WarpCTCGrad"); + auto* loss = ctx.Output("Loss"); + + const size_t level = 0; + + auto logits_lod = framework::ToAbsOffset(logits->lod()); + auto logits_dims = logits->dims(); + PADDLE_ENFORCE_EQ(logits_dims[0], + static_cast(logits_lod[level].back()), + "The first dimension of Input(Logits) should be equal to " + "the sum of all sequences' lengths."); + + auto label_lod = framework::ToAbsOffset(label->lod()); + auto label_dims = label->dims(); + PADDLE_ENFORCE_EQ( + label_dims[0], label->numel(), + "The width of each timestep in Input(Label) should be 1."); + + const size_t num_sequences = logits_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1, + "The number of sequences of Input(Logits) should be " + "equal to that of Input(Label)."); + + const size_t sequence_width = logits->numel() / logits_dims[0]; + auto loss_dims = + framework::make_ddim({static_cast(num_sequences), 1}); + + // warpctc needs sequences data stored in transposed padding format + Tensor warpctc_logits; + const size_t max_sequence_length = + math::MaximumSequenceLength(logits_lod, level); + auto warpctc_logits_dims = + framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *logits, warpctc_logits, + false); + const T* warpctc_logits_data = warpctc_logits.data(); + + std::vector warpctc_label_lengths(num_sequences); + std::vector warpctc_logits_lengths(num_sequences); + + for (size_t i = 0; i < num_sequences; ++i) { + warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i]; + warpctc_logits_lengths[i] = + logits_lod[level][i + 1] - logits_lod[level][i]; + } + + // warpctc computes loss and gradient in one call, gradient data also stored + // in batch 
format + T* warpctc_grad_data = + warpctc_grad->mutable_data(warpctc_logits.dims(), ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), warpctc_grad, + static_cast(0)); + + // warpctc accesses labels in CPU memory + Tensor warpctc_label; + Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label); + const int* warpctc_label_data = warpctc_label.data(); + // warpctc stores loss in CPU memory + Tensor warpctc_loss; + T* warpctc_loss_data = + warpctc_loss.mutable_data(loss_dims, platform::CPUPlace()); + + const size_t blank = static_cast(ctx.Attr("blank")); + + WarpCTCFunctor()( + ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + sequence_width, num_sequences, blank, warpctc_loss_data); + + // Copy the loss back + Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss); + } +}; + +template +class WarpCTCGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); + + logits_grad->mutable_data(ctx.GetPlace()); + bool norm_by_times = ctx.Attr("norm_by_times"); + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *logits_grad, + *warpctc_grad, norm_by_times); + + const T* loss_grad_data = loss_grad->data(); + math::ScaleLoDTensorFunctor()( + ctx.template device_context(), *logits_grad, + loss_grad_data); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d254c572acff52d967e551c377b3b32b05c92973 --- /dev/null +++ b/paddle/fluid/operators/while_op.cc @@ -0,0 +1,352 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +using StepScopeVar = std::vector; +using LoDTensor = framework::LoDTensor; + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; + +class WhileOp : public framework::OperatorBase { + public: + WhileOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); + auto &cond = scope.FindVar(Input(kCondition))->Get(); + PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); + + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + + auto step_scopes = + scope.FindVar(Output(kStepScopes))->GetMutable(); + + 
PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), + "Condition of while op must in CPU memory."); + while (cond.data()[0]) { + auto &current_scope = scope.NewScope(); + step_scopes->push_back(&current_scope); + + executor.Run(*program, &current_scope, block->ID(), + false /*create_local_scope*/); + } + } +}; + +class WhileOpMaker : public framework::OpProtoAndCheckerMaker { + public: + WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kX, + "A set of variables, which are required by operators inside the " + "block of While Op.") + .AsDuplicable(); + AddInput( + kCondition, + "(Bool) An scalar. When it's False, the While Op will be terminated.") + .AsDuplicable(); + AddOutput(kOutputs, + "A set of variables, which will be assigned with values " + "generated by the operators inside the block of While Op.") + .AsDuplicable(); + AddOutput(kStepScopes, + "(StepScopeVar) A vector of local scope, which size equals the " + "step number of While Op. 
The i'th scope storages temporary " + "variables generated in the i'th step."); + AddAttr(kStepBlock, + "The step block inside WhileOp"); + AddComment(R"DOC( +)DOC"); + } +}; + +class WhileGradOp : public framework::OperatorBase { + public: + WhileGradOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + auto *program = block->Program(); + + auto *step_scopes = + scope.FindVar(Input(kStepScopes))->GetMutable(); + + auto outside_og_names = Inputs(framework::GradVarName(kOutputs)); + auto inside_og_names = + Attr>("original_output_grad"); + + PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size()); + + for (auto cur_scope_iter = step_scopes->rbegin(); + cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { + VLOG(3) << "Start backward at time_step " + << cur_scope_iter - step_scopes->rbegin(); + framework::Scope &cur_scope = **cur_scope_iter; + // Link OG from outside to inside + for (size_t i = 0; i < outside_og_names.size(); ++i) { + auto outside_og_name = outside_og_names[i]; + auto inside_og_name = inside_og_names[i]; + VLOG(8) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; + auto &og_outside = + detail::Ref(scope.FindVar(outside_og_name), + "Cannot find Outside Gradient %s", outside_og_name); + auto &og_inside = + detail::Ref(cur_scope.Var(inside_og_name), + "Cannot find inside gradient %s", inside_og_name); + if (og_outside.Type().hash_code() == + typeid(framework::LoDTensor).hash_code()) { + auto &outside_tensor = 
og_outside.Get(); + auto &inside_tensor = + detail::Ref(og_inside.GetMutable()); + inside_tensor.set_lod(outside_tensor.lod()); + inside_tensor.ShareDataWith(outside_tensor); + } else if (og_outside.Type().hash_code() == + typeid(framework::LoDTensorArray).hash_code()) { + auto &outside_array = og_outside.Get(); + auto &inside_array = + detail::Ref(og_inside.GetMutable()); + VLOG(8) << outside_og_name << " size = " << outside_array.size(); + inside_array.resize(outside_array.size()); + + for (size_t j = 0; j < inside_array.size(); ++j) { + VLOG(8) << j << " " << outside_array[j].numel(); + if (outside_array[j].numel() != 0) { + inside_array[j].set_lod(outside_array[j].lod()); + inside_array[j].ShareDataWith(outside_array[j]); + } else { + PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0); + } + } + } + } + + executor.Run(*program, *cur_scope_iter, block->ID(), false); + + auto &pg_names = Outputs(kXGRAD); + auto &p_names = Inputs(kX); + PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); + for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { + if (pg_names[param_id] == framework::kEmptyVarName) { + continue; // parameter doesn't have gradient + } + auto inside_grad_name = framework::GradVarName(p_names[param_id]); + + // // TODO(tonyyang-svail): Not sure we need the following + // // If does not compute gradient of that variable inside rnn, + // just + // // continue + // if (local_var_names.find(inside_grad_name) == + // local_var_names.end()) { + // continue; + // } + + // zero gradient variable in step 0 + if (cur_scope_iter == step_scopes->rbegin()) { + auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); + PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name); + if (var->IsType()) { + auto &inside_tensor = var->Get(); + framework::AttributeMap attrs; + attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); + attrs["value"] = 0.0f; + + auto var_name = 
pg_names[param_id]; + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", framework::VariableNameMap{}, + {{"Out", {var_name}}}, attrs); + zero_op->Run(scope, dev_place); + scope.FindVar(var_name) + ->GetMutable() + ->set_lod(inside_tensor.lod()); + } + } + + auto new_inside_name = cur_scope.Rename(inside_grad_name); + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {pg_names[param_id], new_inside_name}}}, + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + sum_op->Run(cur_scope, dev_place); + cur_scope.Rename(new_inside_name, inside_grad_name); + } + dev_ctx.Wait(); + const_cast(scope).DeleteScope(&cur_scope); + } + } +}; + +class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *while_grad = new framework::OpDesc(); + while_grad->SetType("while_grad"); + while_grad->SetInput(kX, Input(kX)); + while_grad->SetInput(kOutputs, Output(kOutputs)); + while_grad->SetInput(kStepScopes, Output(kStepScopes)); + + auto *grad_block = this->grad_block_[0]; + auto *fwd_block = grad_block->ParentBlock(); + + // Not all of IGs will be generated by inner gradient operators of while op. + // Ignore IGs that is not generated by the inside block. + std::unordered_set inner_op_outputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &oname : op->OutputArgumentNames()) { + inner_op_outputs.insert(oname); + } + } + auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); + for (auto &each_ig : igs) { + if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) { + VLOG(8) << "Ignore " << each_ig; + each_ig = framework::kEmptyVarName; + } + } + while_grad->SetOutput(framework::GradVarName(kX), igs); + + // OG should be re-calculated by step blocks, since many outputs of while op + // do not need to calculate gradients. 
+ std::unordered_set block_ins; + block_ins.reserve(Input(kX).size() + Output(kOutputs).size()); + for (auto &p : Input(kX)) { + block_ins.insert(p); + } + for (auto &o : Output(kOutputs)) { + block_ins.insert(o); + } + std::unordered_set extra_inputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &input_name : op->InputArgumentNames()) { + // If the input of Op has been recorded or is generated by the forward + // block, do not make it as input again. + if (block_ins.find(input_name) != block_ins.end() || + fwd_block->FindVar(input_name) != nullptr) { + continue; + } + extra_inputs.insert(input_name); + } + for (auto &output_name : op->OutputArgumentNames()) { + block_ins.insert(output_name); + } + } + + std::vector extra_inputs_list; + extra_inputs_list.resize(extra_inputs.size()); + std::copy(extra_inputs.begin(), extra_inputs.end(), + extra_inputs_list.begin()); + while_grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); + + while_grad->SetAttrMap(this->Attrs()); + while_grad->SetBlockAttr(kStepBlock, *grad_block); + // record the original output gradient names, since the gradient name of + // while operator could be renamed. 
+ while_grad->SetAttr("original_output_grad", extra_inputs_list); + + return std::unique_ptr(while_grad); + } +}; + +class WhileGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto p_names = op_desc.Input(kX); + auto pg_names = op_desc.Output(framework::GradVarName(kX)); + + for (size_t i = 0; i < p_names.size(); ++i) { + auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); + auto *g_var = block->FindVarRecursive(pg_names[i]); + if (g_var != nullptr) { // Gradient could be @EMPTY@ + VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i] + << " type: " << p_var.GetType(); + g_var->SetType(p_var.GetType()); + g_var->SetDataType(p_var.GetDataType()); + } + } + } +}; + +class WhileGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + ctx->HasInputs(kX); + ctx->HasOutputs(framework::GradVarName(kX)); + ctx->HasInputs(kOutputs); + ctx->HasInputs(framework::GradVarName(kOutputs)); + + auto p_names = ctx->Inputs(kX); + auto pg_names = ctx->Outputs(kXGRAD); + auto var_types = ctx->GetInputsVarType(kX); + std::vector names_to_set; + std::vector dims_to_set; + for (size_t i = 0; i < p_names.size(); ++i) { + if (pg_names[i] == framework::kEmptyVarName) { + continue; + } + auto dims = ctx->GetInputsElementDim(kX, i); + if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) { + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims); + } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) { + // not sure how to set the dim of LOD_TENSOR_ARRAY + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims); + } + } + ctx->SetDims(names_to_set, dims_to_set); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(while, paddle::operators::WhileOp, + paddle::operators::WhileOpMaker, + 
paddle::operators::WhileGradOpDescMaker); +REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp, + paddle::operators::WhileGradOpShapeInference, + paddle::operators::WhileGradOpVarTypeInference); diff --git a/paddle/fluid/platform/.clang-format b/paddle/fluid/platform/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/platform/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ce4b3de39d93e1935c6349ae446dec11d2fa986 --- /dev/null +++ b/paddle/fluid/platform/CMakeLists.txt @@ -0,0 +1,41 @@ +if(WITH_GPU) + cc_library(enforce SRCS enforce.cc DEPS nccl) +else() + cc_library(enforce SRCS enforce.cc) +endif() +cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) + +cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) + +nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) + +cc_library(place SRCS place.cc DEPS enforce boost) +cc_test(place_test SRCS place_test.cc DEPS place glog gflags) + +add_subdirectory(dynload) + +IF(WITH_GPU) + set(GPU_CTX_DEPS dynload_cuda dynamic_loader) +ELSE() + set(GPU_CTX_DEPS) +ENDIF() + +IF(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +ELSE() + set(MKLDNN_CTX_DEPS) +ENDIF() + +# memcpy depends on device_context; deps are added individually here to +# avoid cyclic dependencies +cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator + system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) +nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) + +nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) +nv_test(transform_test SRCS 
transform_test.cu DEPS paddle_memory place device_context) +nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) + +cc_library(profiler SRCS profiler.cc DEPS device_context) +cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h new file mode 100644 index 0000000000000000000000000000000000000000..1f5a8f6a195738ec3b0681aff8565885258a91fb --- /dev/null +++ b/paddle/fluid/platform/assert.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG) +#include +#define PADDLE_ASSERT(e) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, \ + TOSTRING(e)); \ + asm("trap;"); \ + } \ + } while (0) + +#define PADDLE_ASSERT_MSG(e, m) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed (%s).\n", __FILE__, __LINE__, \ + TOSTRING(e), m); \ + asm("trap;"); \ + } \ + } while (0) +#else +#include +#define PADDLE_ASSERT(e) assert(e) +#define PADDLE_ASSERT_MSG(e, m) assert((e) && (m)) +#endif diff --git a/paddle/platform/call_once.h b/paddle/fluid/platform/call_once.h similarity index 100% rename from paddle/platform/call_once.h rename to paddle/fluid/platform/call_once.h diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..47473aead0e512005a63e60b01170b41500dd1f6 --- /dev/null +++ b/paddle/fluid/platform/cpu_info.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/cpu_info.h" + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#include "gflags/gflags.h" + +DEFINE_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +inline size_t CpuTotalPhysicalMemory() { +#ifdef __APPLE__ + int mib[2]; + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + int64_t size = 0; + size_t len = sizeof(size); + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; + return 0L; +#else + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +#endif +} + +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t CpuMinChunkSize() { + // The minimum chunk size that may be allocated is 4 KB. + return 1 << 12; +} + +size_t CpuMaxChunkSize() { + // The maximum chunk size that may be allocated is 1/32 (roughly 3%) of CpuMaxAllocSize(). + return CpuMaxAllocSize() / 32; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h similarity index 100% rename from paddle/platform/cpu_info.h rename to paddle/fluid/platform/cpu_info.h diff --git a/paddle/fluid/platform/cpu_info_test.cc b/paddle/fluid/platform/cpu_info_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1fdba13b80629902faf2bcebb646572b36b459e --- /dev/null +++ b/paddle/fluid/platform/cpu_info_test.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/string/printf.h" + +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; +} diff --git a/paddle/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h similarity index 100% rename from paddle/platform/cuda_helper.h rename to paddle/fluid/platform/cuda_helper.h diff --git a/paddle/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h similarity index 100% rename from paddle/platform/cuda_profiler.h rename to paddle/fluid/platform/cuda_profiler.h diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..f2daa4f4fcc6fe43c2950b413024df7e301abf50 --- /dev/null +++ b/paddle/fluid/platform/cudnn_helper.h @@ -0,0 +1,286 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +inline const char* cudnnGetErrorString(cudnnStatus_t status) { + switch (status) { + case CUDNN_STATUS_SUCCESS: + return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + default: + return "Unknown cudnn error number"; + } +} + +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + +#define CUDNN_ENFORCE(condition) \ + do { \ + cudnnStatus_t status = condition; \ + if (status != CUDNN_STATUS_SUCCESS) { \ + VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \ + PADDLE_THROW("cuDNN call failed"); \ + } \ + } while (false) + +enum class DataLayout { // Not use + kNHWC, + kNCHW, + kNCDHW, + 
kNCHW_VECT_C, +}; + +enum class PoolingMode { + kMaximum, + kAverage, +}; + +template +class CudnnDataType; + +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_FLOAT; + typedef const float ScalingParamType; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; + typedef const double ScalingParamType; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +inline cudnnTensorFormat_t GetCudnnTensorFormat( + const DataLayout& order) { // Not use + switch (order) { + case DataLayout::kNHWC: + return CUDNN_TENSOR_NHWC; + case DataLayout::kNCHW: + return CUDNN_TENSOR_NCHW; + case DataLayout::kNCDHW: + return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same + default: + PADDLE_THROW("Unknown cudnn equivalent for order"); + } + return CUDNN_TENSOR_NCHW; +} + +class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&desc_)); + } + ~ScopedTensorDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(desc_)); + } + + inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, + const cudnnDataType_t type, + const std::vector& dims, + const int groups = 1) { + // the format is not used now, will add later + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + // Update tensor descriptor dims setting if groups > 1 + // NOTE: Assume using NCHW or NCDHW order + std::vector dims_with_group(dims.begin(), dims.end()); // copy + if (groups > 1) { + dims_with_group[1] = 
dims_with_group[1] / groups; + } + PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor( + desc_, type, dims_with_group.size(), dims_with_group.data(), + strides.data())); + return desc_; + } + + template + inline cudnnTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, dims, + groups); + } + + private: + cudnnTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateFilterDescriptor(&desc_)); + } + ~ScopedFilterDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyFilterDescriptor(desc_)); + } + + inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, + const cudnnDataType_t type, + const std::vector& kernel, + const int groups = 1) { + // filter layout: MCHW(MCDHW), where M is the number of + // output image channels, C is the number of input image channels, + // D is the depth of the filter, H is the height of the filter, and W is the + // width of the filter. + std::vector kernel_with_group(kernel.begin(), kernel.end()); + if (groups > 1) { + kernel_with_group[0] /= groups; + // NOTE: input filter(C) of the filter is already asserted to be C/groups. 
+ } + PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor( + desc_, type, format, kernel_with_group.size(), + kernel_with_group.data())); + return desc_; + } + + template + inline cudnnFilterDescriptor_t descriptor(const DataLayout& order, + const std::vector& kernel, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, + kernel, groups); + } + + private: + cudnnFilterDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&desc_)); + } + ~ScopedConvolutionDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(desc_)); + } + + inline cudnnConvolutionDescriptor_t descriptor( + cudnnDataType_t type, const std::vector& pads, + const std::vector& strides, const std::vector& dilations) { + PADDLE_ENFORCE_EQ(pads.size(), strides.size()); + PADDLE_ENFORCE_EQ(pads.size(), dilations.size()); + +#if !CUDNN_VERSION_MIN(6, 0, 0) + // cudnn v5 does not support dilation conv, the argument is called upscale + // instead of dilations and it must be one. 
+ for (size_t i = 0; i < dilations.size(); ++i) { + PADDLE_ENFORCE_EQ( + dilations[i], 1, + "Dilations conv is not supported in this cuDNN version(%d.%d.%d).", + CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100, + CUDNN_VERSION % 100); + } +#endif + + PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( + desc_, pads.size(), pads.data(), strides.data(), dilations.data(), + CUDNN_CROSS_CORRELATION, type)); + return desc_; + } + + template + inline cudnnConvolutionDescriptor_t descriptor( + const std::vector& pads, const std::vector& strides, + const std::vector& dilations) { + return descriptor(CudnnDataType::type, pads, strides, dilations); + } + + private: + cudnnConvolutionDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreatePoolingDescriptor(&desc_)); + } + ~ScopedPoolingDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyPoolingDescriptor(desc_)); + } + + inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, + const std::vector& kernel, + const std::vector& pads, + const std::vector& strides) { + PADDLE_ENFORCE_EQ(kernel.size(), pads.size()); + PADDLE_ENFORCE_EQ(kernel.size(), strides.size()); + PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor( + desc_, (mode == PoolingMode::kMaximum + ? CUDNN_POOLING_MAX + : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING), + CUDNN_PROPAGATE_NAN, // Always propagate nans. 
+ kernel.size(), kernel.data(), pads.data(), strides.data())); + return desc_; + } + + private: + cudnnPoolingDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/cudnn_helper_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cd0bd3fe3ed115c4a91723e1023851456da74890 --- /dev/null +++ b/paddle/fluid/platform/cudnn_helper_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/cudnn_helper.h" +#include + +TEST(CudnnHelper, ScopedTensorDescriptor) { + using paddle::platform::ScopedTensorDescriptor; + using paddle::platform::DataLayout; + + ScopedTensorDescriptor tensor_desc; + std::vector shape = {2, 4, 6, 6}; + auto desc = tensor_desc.descriptor(DataLayout::kNCHW, shape); + + cudnnDataType_t type; + int nd; + std::vector dims(4); + std::vector strides(4); + paddle::platform::dynload::cudnnGetTensorNdDescriptor( + desc, 4, &type, &nd, dims.data(), strides.data()); + + EXPECT_EQ(nd, 4); + for (size_t i = 0; i < dims.size(); ++i) { + EXPECT_EQ(dims[i], shape[i]); + } + EXPECT_EQ(strides[3], 1); + EXPECT_EQ(strides[2], 6); + EXPECT_EQ(strides[1], 36); + EXPECT_EQ(strides[0], 144); + + // test tensor5d: ScopedTensorDescriptor + ScopedTensorDescriptor tensor5d_desc; + std::vector shape_5d = {2, 4, 6, 6, 6}; + auto desc_5d = tensor5d_desc.descriptor(DataLayout::kNCDHW, shape_5d); + + std::vector dims_5d(5); + std::vector strides_5d(5); + paddle::platform::dynload::cudnnGetTensorNdDescriptor( + desc_5d, 5, &type, &nd, dims_5d.data(), strides_5d.data()); + + EXPECT_EQ(nd, 5); + for (size_t i = 0; i < dims_5d.size(); ++i) { + EXPECT_EQ(dims_5d[i], shape_5d[i]); + } + EXPECT_EQ(strides_5d[4], 1); + EXPECT_EQ(strides_5d[3], 6); + EXPECT_EQ(strides_5d[2], 36); + EXPECT_EQ(strides_5d[1], 216); + EXPECT_EQ(strides_5d[0], 864); +} + +TEST(CudnnHelper, ScopedFilterDescriptor) { + using paddle::platform::ScopedFilterDescriptor; + using paddle::platform::DataLayout; + + ScopedFilterDescriptor filter_desc; + std::vector shape = {2, 3, 3}; + auto desc = filter_desc.descriptor(DataLayout::kNCHW, shape); + + cudnnDataType_t type; + int nd; + cudnnTensorFormat_t format; + std::vector kernel(3); + paddle::platform::dynload::cudnnGetFilterNdDescriptor(desc, 3, &type, &format, + &nd, kernel.data()); + + EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < shape.size(); ++i) { 
+ EXPECT_EQ(kernel[i], shape[i]); + } + + ScopedFilterDescriptor filter_desc_4d; + std::vector shape_4d = {2, 3, 3, 3}; + auto desc_4d = filter_desc.descriptor(DataLayout::kNCDHW, shape_4d); + + std::vector kernel_4d(4); + paddle::platform::dynload::cudnnGetFilterNdDescriptor( + desc_4d, 4, &type, &format, &nd, kernel_4d.data()); + + EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); + EXPECT_EQ(nd, 4); + for (size_t i = 0; i < shape_4d.size(); ++i) { + EXPECT_EQ(kernel_4d[i], shape_4d[i]); + } +} + +TEST(CudnnHelper, ScopedConvolutionDescriptor) { + using paddle::platform::ScopedConvolutionDescriptor; + + ScopedConvolutionDescriptor conv_desc; + std::vector src_pads = {2, 2, 2}; + std::vector src_strides = {1, 1, 1}; + std::vector src_dilations = {1, 1, 1}; + auto desc = conv_desc.descriptor(src_pads, src_strides, src_dilations); + + cudnnDataType_t type; + cudnnConvolutionMode_t mode; + int nd; + std::vector pads(3); + std::vector strides(3); + std::vector dilations(3); + paddle::platform::dynload::cudnnGetConvolutionNdDescriptor( + desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode, + &type); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + EXPECT_EQ(dilations[i], src_dilations[i]); + } + EXPECT_EQ(mode, CUDNN_CROSS_CORRELATION); +} + +TEST(CudnnHelper, ScopedPoolingDescriptor) { + using paddle::platform::ScopedPoolingDescriptor; + using paddle::platform::PoolingMode; + + ScopedPoolingDescriptor pool_desc; + std::vector src_kernel = {2, 2, 5}; + std::vector src_pads = {1, 1, 2}; + std::vector src_strides = {2, 2, 3}; + auto desc = pool_desc.descriptor(PoolingMode::kMaximum, src_kernel, src_pads, + src_strides); + + cudnnPoolingMode_t mode; + cudnnNanPropagation_t nan_t = CUDNN_PROPAGATE_NAN; + int nd; + std::vector kernel(3); + std::vector pads(3); + std::vector strides(3); + paddle::platform::dynload::cudnnGetPoolingNdDescriptor( + desc, 
3, &mode, &nan_t, &nd, kernel.data(), pads.data(), strides.data()); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(kernel[i], src_kernel[i]); + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + } + EXPECT_EQ(mode, CUDNN_POOLING_MAX); +} diff --git a/paddle/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/device_ptr_cast.h similarity index 100% rename from paddle/platform/details/device_ptr_cast.h rename to paddle/fluid/platform/details/device_ptr_cast.h diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4da846bb1c25abc3d31006657652abaa5a11add --- /dev/null +++ b/paddle/fluid/platform/device_context.cc @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace platform { + +DeviceContextPool* DeviceContextPool::pool = nullptr; + +const platform::DeviceContext* DeviceContextPool::Get( + const platform::Place& place) { + auto it = device_contexts_.find(place); + if (it == device_contexts_.end()) { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + return it->second; +} + +DeviceContextPool::DeviceContextPool( + const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + for (size_t i = 0; i < places.size(); i++) { + if (platform::is_cpu_place(places[i])) { + device_contexts_.emplace(places[i], + new platform::CPUDeviceContext( + boost::get(places[i]))); + } else if (platform::is_gpu_place(places[i])) { +#ifdef PADDLE_WITH_CUDA + device_contexts_.emplace(places[i], + new platform::CUDADeviceContext( + boost::get(places[i]))); +#else + PADDLE_THROW( + "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " + "option"); +#endif + } + } +} + +CPUDeviceContext::CPUDeviceContext() { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +Place CPUDeviceContext::GetPlace() const { return place_; } + +#ifdef PADDLE_WITH_CUDA + +class EigenCudaStreamDevice : public Eigen::StreamInterface { + public: + EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenCudaStreamDevice() override {} + + void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) { + stream_ = cuda_stream; + place_ = place; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const cudaStream_t& stream() const override { return *stream_; } + + const cudaDeviceProp& 
deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + return paddle::memory::Alloc(place_, num_bytes); + } + + void deallocate(void* buffer) const override { + paddle::memory::Free(place_, buffer); + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = + static_cast(scratchpad()) + Eigen::kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + PADDLE_ENFORCE( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); + } + return semaphore_; + } + + private: + CUDAPlace place_; + const cudaStream_t* stream_; // not owned; + const cudaDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + eigen_stream_.reset(new EigenCudaStreamDevice()); + eigen_stream_->Reinitialize(&stream_, place); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); + PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + if (dynload::HasCUDNN()) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); + } else { + cudnn_handle_ = nullptr; + } +} + +CUDADeviceContext::~CUDADeviceContext() { + SetDeviceId(place_.device); + Wait(); + PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + if (cudnn_handle_ != nullptr) { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + } + eigen_stream_.reset(); + eigen_device_.reset(); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +Place CUDADeviceContext::GetPlace() const { return place_; } + +void 
CUDADeviceContext::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaGetLastError()); +} + +Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +cublasHandle_t CUDADeviceContext::cublas_handle() const { + return cublas_handle_; +} + +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } + +cudaStream_t CUDADeviceContext::stream() const { return stream_; } + +#endif + +#ifdef PADDLE_WITH_MKLDNN +MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) + : CPUDeviceContext(place), ready_(false) { + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0)); +} + +template +void MKLDNNDeviceContext::AddElement(const std::string& op_key, + const T& value) { + if (GetElement(op_key)) { + return; + } + GetElementPool().emplace(op_key, std::move(value)); +} + +template +const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const { + auto it = GetElementPool().find(op_key); + return it == GetElementPool().end() ? 
nullptr : it->second; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return memory_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_desc_pool_; +} + +void MKLDNNDeviceContext::Execute(bool block) { + if (pipeline_.empty()) { + return; + } + ResetStream(); + stream_->submit(pipeline_).wait(block); + ready_ = false; + pipeline_.clear(); +} + +void MKLDNNDeviceContext::ResetStream() { + if (ready_) { + return; + } + // TODO(TJ): change me when mkldnn have specific method to reset this state + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + ready_ = true; +} + +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h new file mode 100644 index 0000000000000000000000000000000000000000..10b581f41a1e9473a2f85d3e5d2e40ee1fdaa1af --- /dev/null +++ b/paddle/fluid/platform/device_context.h @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/gpu_info.h" +#define EIGEN_USE_GPU +#endif + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" + +#include "glog/logging.h" + +namespace paddle { +namespace platform { + +class DeviceContext { + public: + virtual ~DeviceContext() {} + virtual Place GetPlace() const = 0; + + virtual void Wait() const {} +}; + +class CPUDeviceContext : public DeviceContext { + public: + CPUDeviceContext(); + explicit CPUDeviceContext(CPUPlace place); + + Eigen::DefaultDevice* eigen_device() const; + + Place GetPlace() const override; + + private: + CPUPlace place_; + std::unique_ptr eigen_device_; +}; + +template +struct DefaultDeviceContextType; + +template <> +struct DefaultDeviceContextType { + using TYPE = CPUDeviceContext; +}; + +#ifdef PADDLE_WITH_CUDA + +class EigenCudaStreamDevice; + +class CUDADeviceContext : public DeviceContext { + public: + explicit CUDADeviceContext(CUDAPlace place); + virtual ~CUDADeviceContext(); + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + /*! \brief Return place in the device context. */ + Place GetPlace() const override; + + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; + + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle() const; + + /*! \brief Return cudnn handle in the device context. */ + cudnnHandle_t cudnn_handle() const; + + /*! \brief Return cuda stream in the device context. 
*/ + cudaStream_t stream() const; + + private: + CUDAPlace place_; + + std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; + + cudaStream_t stream_; + cudnnHandle_t cudnn_handle_; + cublasHandle_t cublas_handle_; +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = CUDADeviceContext; +}; + +#endif + +#ifdef PADDLE_WITH_MKLDNN +class MKLDNNDeviceContext : public CPUDeviceContext { + public: + explicit MKLDNNDeviceContext(CPUPlace place); + + /* \brief Add new element: memory, primitive or primitive desc */ + template + void AddElement(const std::string& op_key, const T& value); + + /* \brief Get existed element: memory, primitive or primitive desc */ + template + const T& GetElement(const std::string& op_key) const; + + /* \brief Get element pool: memory, primitive or primitive desc pool */ + template + const std::unordered_map>& + GetElementPool() const; + + /* \brief Get the active engine */ + const MKLDNNEngine& engine() const { return *engine_; } + + /* \brief Submit primitive to pipeline */ + void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); } + + /*! \brief Execute all submitted primitives in pipeline */ + void Execute(bool block = true); + + protected: + /*! \brief Reset the stream to prepare next exectue */ + void ResetStream(); + + private: + std::unordered_map> + memory_pool_; + std::unordered_map> + primitive_pool_; + std::unordered_map> + primitive_desc_pool_; + std::vector pipeline_; + MKLDNNStreamPtr stream_; + MKLDNNEnginePtr engine_; + bool ready_; +}; +#endif + +/*! \brief device context pool singleton */ +class DeviceContextPool { + public: + explicit DeviceContextPool(const std::vector& places); + + static DeviceContextPool& Instance() { + PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); + return *pool; + } + + /*! 
\brief Create should only called by Init function */ + static DeviceContextPool& Init(const std::vector& places) { + if (pool == nullptr) { + pool = new DeviceContextPool(places); + } + return *pool; + } + + /*! \brief Return handle of single device context. */ + const platform::DeviceContext* Get(const platform::Place& place); + + template + const typename DefaultDeviceContextType::TYPE* GetByPlace( + const Place& place) { + return reinterpret_cast< + const typename DefaultDeviceContextType::TYPE*>(Get(place)); + } + + size_t size() const { return device_contexts_.size(); } + + private: + static DeviceContextPool* pool; + constexpr static int LEFT_SHIFT = 8; + struct Hash { + std::hash hash_; + size_t operator()(const platform::Place& place) const { + int pre_hash = place.which() << LEFT_SHIFT; + if (platform::is_gpu_place(place)) { + pre_hash += boost::get(place).GetDeviceId(); + } + return hash_(pre_hash); + } + }; + std::unordered_map + device_contexts_; + DISABLE_COPY_AND_ASSIGN(DeviceContextPool); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..f4dae6e90a8d12fc2dccab6fd3c6881e58e80fed --- /dev/null +++ b/paddle/fluid/platform/device_context_test.cu @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +#include "glog/logging.h" + +TEST(Device, Init) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device); + delete device_context; + } +} + +TEST(Device, CUDADeviceContext) { + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device); + cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); + ASSERT_NE(nullptr, cudnn_handle); + cublasHandle_t cublas_handle = device_context->cublas_handle(); + ASSERT_NE(nullptr, cublas_handle); + ASSERT_NE(nullptr, device_context->stream()); + delete device_context; + } +} + +TEST(Device, DeviceContextPool) { + using paddle::platform::DeviceContextPool; + using paddle::platform::CUDADeviceContext; + using paddle::platform::Place; + using paddle::platform::CPUPlace; + using paddle::platform::CUDAPlace; + + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); + + std::vector gpu_places; + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + auto dev_ctx = pool.Get(CUDAPlace(i)); + ASSERT_NE(dev_ctx, nullptr); + } +} + +int main(int argc, char** argv) { + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = 
paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt similarity index 100% rename from paddle/platform/dynload/CMakeLists.txt rename to paddle/fluid/platform/dynload/CMakeLists.txt diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc new file mode 100644 index 0000000000000000000000000000000000000000..c599712554b1d6183c896eb7fc6ac5bbf71d67fc --- /dev/null +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/cublas.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h new file mode 100644 index 0000000000000000000000000000000000000000..05f69e506515ac092c8509fba26e5b7a0f0823f7 --- /dev/null +++ b/paddle/fluid/platform/dynload/cublas.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cublas_dso_flag; +extern void *cublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cublasStatus_t operator()(Args... 
args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, \ + paddle::platform::dynload::GetCublasDsoHandle, \ + &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cublasStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ + DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSaxpy_v2); \ + __macro(cublasDaxpy_v2); \ + __macro(cublasSgemv_v2); \ + __macro(cublasDgemv_v2); \ + __macro(cublasSgemm_v2); \ + __macro(cublasDgemm_v2); \ + __macro(cublasSgeam_v2); \ + __macro(cublasDgeam_v2); \ + __macro(cublasCreate_v2); \ + __macro(cublasDestroy_v2); \ + __macro(cublasSetStream_v2); \ + __macro(cublasSetPointerMode_v2); \ + __macro(cublasGetPointerMode_v2); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched) + +CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b1c4c4f9609ebb61674c07f6cfd615d9674dfcd --- /dev/null +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -0,0 +1,62 @@ 
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); +CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R5 +CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R7 +CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef PADDLE_USE_DSO +bool HasCUDNN() { + std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle); + return cudnn_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE(cudnn_dso_handle != nullptr, + "Cannot load cudnn shared library. 
Cannot invoke method %s", + fn_name); +} +#else +bool HasCUDNN() { return true; } +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h new file mode 100644 index 0000000000000000000000000000000000000000..00dfbc83872ed04d2b4e840e3f6ac31e89a8a3cd --- /dev/null +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cudnn_dso_flag; +extern void* cudnn_dso_handle; +extern bool HasCUDNN(); + +#ifdef PADDLE_USE_DSO + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, \ + paddle::platform::dynload::GetCUDNNDsoHandle, \ + &cudnn_dso_handle); \ + EnforceCUDNNLoaded(#__name); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +#else + +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#endif + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + 
__macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnGetErrorString); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); +CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#if CUDNN_VERSION >= 3000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +// APIs available after R4: +#if CUDNN_VERSION >= 4007 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(cudnnBatchNormalizationForwardTraining); \ + __macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +// APIs in R5 +#if CUDNN_VERSION >= 5000 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); +CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION >= 7001 +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); +CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff 
--git a/paddle/fluid/platform/dynload/curand.cc b/paddle/fluid/platform/dynload/curand.cc new file mode 100644 index 0000000000000000000000000000000000000000..eac690b1458a4e53f6958b77d01d2c3c9f26f6eb --- /dev/null +++ b/paddle/fluid/platform/dynload/curand.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/curand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h new file mode 100644 index 0000000000000000000000000000000000000000..ce3115b3ce0dbaa30af63bfc15908f81c16309d1 --- /dev/null +++ b/paddle/fluid/platform/dynload/curand.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, \ + paddle::platform::dynload::GetCurandDsoHandle, \ + &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(curandCreateGenerator); \ + __macro(curandSetStream); \ + __macro(curandSetPseudoRandomGeneratorSeed); \ + __macro(curandGenerateUniform); \ + __macro(curandGenerateUniformDouble); \ + __macro(curandGenerateNormal); \ + __macro(curandDestroyGenerator); + +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb00f93b7cde0a39beb4adabd910eef634c2581c --- /dev/null +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +DEFINE_string(cudnn_dir, "", + "Specify path for loading libcudnn.so. For instance, " + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); + +DEFINE_string(cuda_dir, "", + "Specify path for loading cuda library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); + +DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); + +DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); + +DEFINE_string(nccl_dir, "", + "Specify path for loading nccl library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); + +namespace paddle { +namespace platform { +namespace dynload { + +static inline std::string join(const std::string& part1, + const std::string& part2) { + // directory separator + const char sep = '/'; + if (!part2.empty() && part2.front() == sep) { + return part2; + } + std::string ret; + ret.reserve(part1.size() + part2.size() + 1); + ret = part1; + if (!ret.empty() && ret.back() != sep) { + ret += sep; + } + ret += part2; + return ret; +} + +static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, + void** dso_handle, + int dynload_flags) { + VLOG(3) << "Try to find library: " << dso_path + << " from default system path."; + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + +// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to +// bring System Integrity Projection (SIP), if dso_handle +// is null, search from default package path in Mac OS. 
+#if defined(__APPLE__) || defined(__OSX__) + if (nullptr == *dso_handle) { + dso_path = join("/usr/local/cuda/lib/", dso_path); + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + if (nullptr == *dso_handle) { + if (dso_path == "libcudnn.dylib") { + LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"; + } + } + } +#endif +} + +static inline void GetDsoHandleFromSearchPath(const std::string& search_root, + const std::string& dso_name, + void** dso_handle, + bool throw_on_error = true) { + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; + *dso_handle = nullptr; + + std::string dlPath = dso_name; + if (search_root.empty()) { + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } else { + // search xxx.so from custom path + dlPath = join(search_root, dso_name); + *dso_handle = dlopen(dlPath.c_str(), dynload_flags); + // if not found, search from default path + if (nullptr == *dso_handle) { + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; + dlPath = dso_name; + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } + } + auto error_msg = + "Failed to find dynamic library: %s ( %s ) \n Please specify " + "its path correctly using following ways: \n Method. set " + "environment variable LD_LIBRARY_PATH on Linux or " + "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " + "export LD_LIBRARY_PATH=... 
\n Note: After Mac OS 10.11, " + "using the DYLD_LIBRARY_PATH is impossible unless System " + "Integrity Protection (SIP) is disabled."; + if (throw_on_error) { + PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror()); + } else if (nullptr == *dso_handle) { + LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + } +} + +void GetCublasDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); +#endif +} + +void GetCUDNNDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, + false); +#else + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); +#endif +} + +void GetCurandDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); +#endif +} + +void GetWarpCTCDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} + +void GetLapackDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); +#endif +} + +void GetNCCLDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle); +#endif +} + +} // namespace dynload +} // namespace platform +} // 
namespace paddle diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h similarity index 100% rename from paddle/platform/dynload/dynamic_loader.h rename to paddle/fluid/platform/dynload/dynamic_loader.h diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc new file mode 100644 index 0000000000000000000000000000000000000000..1dc3e96f04a4f98e5cf0cb7848a99213ca082b79 --- /dev/null +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/nccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nccl_dso_flag; +void *nccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +void LoadNCCLDSO() { + platform::call_once(nccl_dso_flag, + [] { GetNCCLDsoHandle(&nccl_dso_handle); }); +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h new file mode 100644 index 0000000000000000000000000000000000000000..349a4d0ba325fe1f7c23873c82dce0d35d295340 --- /dev/null +++ b/paddle/fluid/platform/dynload/nccl.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/call_once.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag nccl_dso_flag; +extern void* nccl_dso_handle; + +#ifdef PADDLE_USE_DSO +extern void LoadNCCLDSO(); + +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(__name(args...)) (*)(Args...); \ + paddle::platform::dynload::LoadNCCLDSO(); \ + void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + ncclResult_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define NCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(ncclCommInitAll); \ + __macro(ncclGetUniqueId); \ + __macro(ncclCommInitRank); \ + __macro(ncclCommDestroy); \ + __macro(ncclCommCount); \ + __macro(ncclCommCuDevice); \ + __macro(ncclCommUserRank); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ + __macro(ncclReduce); \ + __macro(ncclGetErrorString); + +NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/warpctc.cc b/paddle/fluid/platform/dynload/warpctc.cc new file mode 100644 index 0000000000000000000000000000000000000000..84de2cae94790ca44c8c0f87d75a45a1b7001a64 --- /dev/null +++ b/paddle/fluid/platform/dynload/warpctc.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/warpctc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +WARPCTC_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h new file mode 100644 index 0000000000000000000000000000000000000000..f1955818dede54b0d9c53cb33b95c6102b854d7b --- /dev/null +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "ctc.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag warpctc_dso_flag; +extern void* warpctc_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once(warpctc_dso_flag, \ + paddle::platform::dynload::GetWarpCTCDsoHandle, \ + &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + DYNAMIC_LOAD_WARPCTC_WRAP(__name) + +#define WARPCTC_ROUTINE_EACH(__macro) \ + __macro(get_warpctc_version); \ + __macro(ctcGetStatusString); \ + __macro(compute_ctc_loss); \ + __macro(get_workspace_size) + +WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.cc b/paddle/fluid/platform/enforce.cc new file mode 100644 index 0000000000000000000000000000000000000000..55cd80943cf18545d3cac6f3ebddee1030d62b37 --- /dev/null +++ b/paddle/fluid/platform/enforce.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform {} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h new file mode 100644 index 0000000000000000000000000000000000000000..b22893c0a56f39347894924b3cd6ea64180aa8b6 --- /dev/null +++ b/paddle/fluid/platform/enforce.h @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // for dladdr +#include // for backtrace +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/macros.h" +#include "paddle/string/printf.h" +#include "paddle/string/to_string.h" + +#ifdef __GNUC__ +#include // for __cxa_demangle +#endif + +#include + +#ifdef PADDLE_WITH_CUDA + +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/dynload/curand.h" +#include "paddle/fluid/platform/dynload/nccl.h" + +#include +#include +#include +#include +#include + +#endif + +namespace paddle { +namespace platform { + +#ifdef __GNUC__ +inline std::string demangle(std::string name) { + int status = -4; // some arbitrary value to eliminate the compiler warning + std::unique_ptr res{ + abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; + return (status == 0) ? 
res.get() : name; +} +#else +inline std::string demangle(std::string name) { return name; } +#endif + +struct EnforceNotMet : public std::exception { + std::exception_ptr exp_; + std::string err_str_; + EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { + static constexpr int TRACE_STACK_LIMIT = 100; + try { + std::rethrow_exception(exp_); + } catch (const std::exception& exp) { + std::ostringstream sout; + + sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; + sout << "PaddlePaddle Call Stacks: " << std::endl; + + void* call_stack[TRACE_STACK_LIMIT]; + auto size = backtrace(call_stack, TRACE_STACK_LIMIT); + auto symbols = backtrace_symbols(call_stack, size); + + Dl_info info; + for (int i = 0; i < size; ++i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + auto addr_offset = static_cast(call_stack[i]) - + static_cast(info.dli_saddr); + sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, + 2 + sizeof(void*) * 2, call_stack[i], + demangled, addr_offset); + } else { + sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, + call_stack[i]); + } + } + free(symbols); + err_str_ = sout.str(); + } + } + + const char* what() const noexcept { return err_str_.c_str(); } +}; + +// Because most enforce conditions would evaluate to true, we can use +// __builtin_expect to instruct the C++ compiler to generate code that +// always forces branch prediction of true. +// This generates faster binary code. __builtin_expect is since C++11. +// For more details, please check https://stackoverflow.com/a/43870188/724872. +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) + +template +inline typename std::enable_if::type throw_on_error( + bool stat, const Args&... 
args) { + if (UNLIKELY(!(stat))) { + throw std::runtime_error(string::Sprintf(args...)); + } +} + +#ifdef PADDLE_WITH_CUDA + +template +inline typename std::enable_if::type throw_on_error( + cudaError_t e, const Args&... args) { + if (UNLIKELY(e)) { + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + curandStatus_t stat, const Args&... args) { + if (stat != CURAND_STATUS_SUCCESS) { + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + cudnnStatus_t stat, const Args&... args) { + if (stat == CUDNN_STATUS_SUCCESS) { + return; + } else { + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + cublasStatus_t stat, const Args&... args) { + std::string err; + if (stat == CUBLAS_STATUS_SUCCESS) { + return; + } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + err = "CUBLAS: not initialized, "; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + err = "CUBLAS: alloc failed, "; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + err = "CUBLAS: invalid value, "; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + err = "CUBLAS: arch mismatch, "; + } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { + err = "CUBLAS: mapping error, "; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + err = "CUBLAS: execution failed, "; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + err = "CUBLAS: internal error, "; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + err = "CUBLAS: not supported, "; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + err = "CUBLAS: license error, "; + } + throw std::runtime_error(err + string::Sprintf(args...)); +} + +template +inline typename std::enable_if::type throw_on_error( + 
ncclResult_t stat, const Args&... args) { + if (stat == ncclSuccess) { + return; + } else { + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + + string::Sprintf(args...)); + } +} + +#endif // PADDLE_ONLY_CPU + +template +inline void throw_on_error(T e) { + throw_on_error(e, ""); +} + +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ + } while (false) + +#define PADDLE_ENFORCE(...) \ + do { \ + try { \ + ::paddle::platform::throw_on_error(__VA_ARGS__); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ + } while (false) + +/* + * Some enforce helpers here, usage: + * int a = 1; + * int b = 2; + * PADDLE_ENFORCE_EQ(a, b); + * + * will raise an expression described as follows: + * "enforce a == b failed, 1 != 2" with detailed stack information. + * + * extra messages is also supported, for example: + * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) + */ + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) 
\ + do { \ + if (UNLIKELY(nullptr == (__VAL))) { \ + PADDLE_THROW(#__VAL " should not be null\n%s", \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + do { \ + if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \ + PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \ + " %s\n%s", \ + #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + paddle::string::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..896a9a04eca80f22bebf87859d277458d9bdb092 --- /dev/null +++ b/paddle/fluid/platform/enforce_test.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/string/piece.h" + +using StringPiece = paddle::string::Piece; +using paddle::string::HasPrefix; + +TEST(ENFORCE, OK) { + PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345); + size_t val = 1; + const size_t limit = 10; + PADDLE_ENFORCE(val < limit, "Enforce is OK too"); +} + +TEST(ENFORCE, FAILED) { + bool caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE, NO_ARG_OK) { + int a = 2; + int b = 2; + PADDLE_ENFORCE_EQ(a, b); + // test enforce with extra message. + PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info"); +} + +TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { + int a = 2; + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, 1 + 3); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4"); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { + int a = 2; + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + HasPrefix(StringPiece(error.what()), + "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_NE, OK) { + PADDLE_ENFORCE_NE(1, 2); + PADDLE_ENFORCE_NE(1.0, 2UL); +} +TEST(ENFORCE_NE, FAIL) { + bool caught_exception = false; + + try { + // 2UL here to check data type compatible + PADDLE_ENFORCE_NE(1.0, 1UL); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1.0 != 
1UL failed, 1 == 1")) + << error.what() << " does not have expected prefix"; + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } +TEST(ENFORCE_GT, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_GE, OK) { + PADDLE_ENFORCE_GE(2, 2UL); + PADDLE_ENFORCE_GE(3, 2UL); + PADDLE_ENFORCE_GE(3, 2); + PADDLE_ENFORCE_GE(3.21, 2UL); +} +TEST(ENFORCE_GE, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GE(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_LE, OK) { + PADDLE_ENFORCE_LE(1, 1); + PADDLE_ENFORCE_LE(1, 1UL); + PADDLE_ENFORCE_LE(2, 3UL); + PADDLE_ENFORCE_LE(2UL, 3); + PADDLE_ENFORCE_LE(2UL, 3.2); +} +TEST(ENFORCE_LE, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_LT, OK) { + PADDLE_ENFORCE_LT(3, 10); + PADDLE_ENFORCE_LT(2, 3UL); + PADDLE_ENFORCE_LT(2UL, 3); +} +TEST(ENFORCE_LT, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_LT(1UL, 0.12); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1UL < 0.12 failed, 1 >= 0.12")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_NOT_NULL, OK) { + int* a = new int; + PADDLE_ENFORCE_NOT_NULL(a); + delete a; +} +TEST(ENFORCE_NOT_NULL, FAIL) { + bool caught_exception = 
false; + try { + int* a = nullptr; + PADDLE_ENFORCE_NOT_NULL(a); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null")); + } + EXPECT_TRUE(caught_exception); +} + +struct Dims { + size_t dims_[4]; + + bool operator==(const Dims& o) const { + for (size_t i = 0; i < 4; ++i) { + if (dims_[i] != o.dims_[i]) return false; + } + return true; + } +}; + +std::ostream& operator<<(std::ostream& os, const Dims& d) { + for (size_t i = 0; i < 4; ++i) { + if (i == 0) { + os << "["; + } + os << d.dims_[i]; + if (i == 4 - 1) { + os << "]"; + } else { + os << ", "; + } + } + return os; +} + +TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { + Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}}; + PADDLE_ENFORCE_EQ(a, b); +} + +TEST(ENFORCE_USER_DEFINED_CLASS, NE) { + Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; + ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h new file mode 100644 index 0000000000000000000000000000000000000000..0e695328c394cd5cd1a14f42b7c82e8899e2167b --- /dev/null +++ b/paddle/fluid/platform/for_range.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { + +template +struct ForRange { + ForRange(const DeviceContext& dev_ctx, size_t limit); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRange { + ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +#ifdef __NVCC__ +template +__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { + size_t idx = static_cast(threadIdx.x); + func(idx); +} + +template +__global__ static void ForRangeElemwiseOp(Function func, int limit) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + func(idx); + } +} + +template <> +struct ForRange { + ForRange(const CUDADeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { + constexpr int num_threads = 1024; + int block_size = limit_ <= num_threads ? limit_ : num_threads; + int grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const CUDADeviceContext& dev_ctx_; + int limit_; +}; + +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..1797f59a9c9731d28febe32d268d2b07073550eb --- /dev/null +++ b/paddle/fluid/platform/gpu_info.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/gpu_info.h" + +#include "gflags/gflags.h" + +#include "paddle/fluid/platform/enforce.h" + +DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, + "Default use 92% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +int GetCUDADeviceCount() { + int count; + PADDLE_ENFORCE( + cudaGetDeviceCount(&count), + "cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount"); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE( + cudaGetDevice(&device_id), + "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE(cudaSetDevice(id), + "cudaSetDevice failed in paddle::platform::SetDeviceId"); +} + +void GpuMemoryUsage(size_t &available, size_t &total) { + PADDLE_ENFORCE(cudaMemGetInfo(&available, &total), + "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); +} + +size_t GpuMaxAllocSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserve the rest for page tables, etc. + return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); +} + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. 
+ return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; + size_t reserving = static_cast(0.05 * total); + // If available less than minimum chunk size, no usable memory exists. + available = + std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), + total - reserving); + + // Reserving the rest memory for page tables, etc. + + size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * + (total - reserving)); + + PADDLE_ENFORCE_LE(allocating, available); + + return allocating; +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind, cudaStream_t stream) { + PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); +} + +void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, + size_t count, cudaStream_t stream) { + PADDLE_ENFORCE( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), + "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer"); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { + PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), + "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); +} +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h similarity index 100% rename from paddle/platform/gpu_info.h rename to paddle/fluid/platform/gpu_info.h diff --git a/paddle/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h similarity index 100% rename from paddle/platform/hostdevice.h rename to paddle/fluid/platform/hostdevice.h diff --git a/paddle/platform/macros.h b/paddle/fluid/platform/macros.h similarity index 100% rename from paddle/platform/macros.h rename to 
paddle/fluid/platform/macros.h diff --git a/paddle/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h similarity index 100% rename from paddle/platform/mkldnn_helper.h rename to paddle/fluid/platform/mkldnn_helper.h diff --git a/paddle/fluid/platform/nccl_test.cu b/paddle/fluid/platform/nccl_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..75b95aff1a41dac70f1b732938c648ca55b2a973 --- /dev/null +++ b/paddle/fluid/platform/nccl_test.cu @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" + +static int dev_count = 0; + +namespace paddle { +namespace platform { + +TEST(NCCL, init) { + std::vector comms; + comms.resize(dev_count); + PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); + + for (int i = 0; i < dev_count; ++i) { + dynload::ncclCommDestroy(comms[i]); + } +} + +template +struct PerThreadData { + thrust::device_vector send_buff; + thrust::device_vector recv_buff; + CUDADeviceContext dev_ctx; + + T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); } + + T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } + + PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) { + send_buff.resize(size); + for (size_t i = 0; i < size; ++i) { + send_buff[i] = static_cast(i); + } + recv_buff.resize(size); + } +}; + +static constexpr int ELEM_COUNT = 10000; + +TEST(NCCL, all_reduce) { + std::vector comms; + comms.resize(dev_count); + VLOG(1) << "Initializing ncclComm"; + dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); + VLOG(1) << "ncclComm initialized"; + VLOG(1) << "Creating thread data"; + std::vector>> data; + data.reserve(dev_count); + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Creating thread data for device " << i; + SetDeviceId(i); + data.emplace_back(new PerThreadData(i, ELEM_COUNT)); + } + VLOG(1) << "Thread data created"; + + VLOG(1) << "Check send_buf data"; + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Check on device " << i; + SetDeviceId(i); + thrust::host_vector tmp = data[i]->send_buff; + for (size_t j = 0; j < tmp.size(); ++j) { + ASSERT_NEAR(static_cast(j), tmp[j], 1e-5); + } + } + + VLOG(1) << "Invoking ncclAllReduce"; + + for 
(int i = 0; i < dev_count; ++i) { + VLOG(1) << "Invoking ncclAllReduce with device " << i; + SetDeviceId(i); + PADDLE_ENFORCE(dynload::ncclAllReduce( + data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble, + ncclSum, comms[i], data[i]->dev_ctx.stream())); + VLOG(1) << "Invoked ncclAllReduce for device " << i; + } + + VLOG(1) << "Invoked ncclAllReduce"; + + VLOG(1) << "Sync devices"; + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Sync device " << i; + SetDeviceId(i); + data[i]->dev_ctx.Wait(); + } + VLOG(1) << "device synced"; + + for (int i = 0; i < dev_count; ++i) { + SetDeviceId(i); + VLOG(1) << "Checking vector on device " << i; + thrust::host_vector tmp = data[i]->recv_buff; + for (size_t j = 0; j < tmp.size(); ++j) { + auto elem = static_cast(j); + elem *= dev_count; + ASSERT_NEAR(tmp[j], elem, 1e-4); + } + } + + for (int i = 0; i < dev_count; ++i) { + dynload::ncclCommDestroy(comms[i]); + } +} +} // namespace platform +} // namespace paddle + +int main(int argc, char** argv) { + // FIXME(tonyyang-svail): + // Due to the driver issue on our CI, disable for now + return 0; + dev_count = paddle::platform::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc new file mode 100644 index 0000000000000000000000000000000000000000..e99b75d761abcf065070f463d578171797383cea --- /dev/null +++ b/paddle/fluid/platform/place.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +namespace detail { + +class PlacePrinter : public boost::static_visitor<> { + public: + explicit PlacePrinter(std::ostream &os) : os_(os) {} + void operator()(const CPUPlace &) { os_ << "CPUPlace"; } + void operator()(const CUDAPlace &p) { + os_ << "CUDAPlace(" << p.device << ")"; + } + + private: + std::ostream &os_; +}; + +} // namespace detail + +static Place the_default_place; + +void set_place(const Place &place) { the_default_place = place; } +const Place &get_place() { return the_default_place; } + +const CUDAPlace default_gpu() { return CUDAPlace(0); } +const CPUPlace default_cpu() { return CPUPlace(); } + +bool is_gpu_place(const Place &p) { + return boost::apply_visitor(IsCUDAPlace(), p); +} + +bool is_cpu_place(const Place &p) { return !is_gpu_place(p); } + +bool places_are_same_class(const Place &p1, const Place &p2) { + return p1.which() == p2.which(); +} + +bool is_same_place(const Place &p1, const Place &p2) { + if (places_are_same_class(p1, p2)) { + if (is_cpu_place(p1)) { + return true; + } else { + return boost::get(p1) == boost::get(p2); + } + } else { + return false; + } +} + +std::ostream &operator<<(std::ostream &os, const Place &p) { + detail::PlacePrinter printer(os); + boost::apply_visitor(printer, p); + return os; +} + +} // namespace platform +} // namespace paddle diff --git 
a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h new file mode 100644 index 0000000000000000000000000000000000000000..2977a41036e84fc5aabd69c24cc9e62391b7dc38 --- /dev/null +++ b/paddle/fluid/platform/place.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace platform { + +struct CPUPlace { + // WORKAROUND: for some reason, omitting this constructor + // causes errors with boost 1.59 and OSX + CPUPlace() {} + + // needed for variant equality comparison + inline bool operator==(const CPUPlace &) const { return true; } + inline bool operator!=(const CPUPlace &) const { return false; } +}; + +struct CUDAPlace { + CUDAPlace() : CUDAPlace(0) {} + explicit CUDAPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const CUDAPlace &o) const { + return device == o.device; + } + inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } + + int device; +}; + +struct IsCUDAPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const CUDAPlace &gpu) const { return true; } +}; + +typedef boost::variant Place; + +using PlaceList = std::vector; + +void set_place(const Place &); +const Place 
&get_place(); + +const CUDAPlace default_gpu(); +const CPUPlace default_cpu(); + +bool is_gpu_place(const Place &); +bool is_cpu_place(const Place &); +bool places_are_same_class(const Place &, const Place &); +bool is_same_place(const Place &, const Place &); + +std::ostream &operator<<(std::ostream &, const Place &); + +template +struct PlaceVisitorWrapper + : public boost::static_visitor { + const Visitor &visitor_; + explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {} + + typename Visitor::result_type operator()(const CPUPlace &cpu) const { + return visitor_(cpu); + } + + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { +#ifdef PADDLE_WITH_CUDA + return visitor_(cuda); +#else + PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device"); + return typename Visitor::result_type(); +#endif + } +}; + +template +typename Visitor::result_type VisitPlace(const Place &place, + const Visitor &visitor) { + return boost::apply_visitor(PlaceVisitorWrapper(visitor), place); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f248902d91c1ffad1364a2f1078a41626b61ac22 --- /dev/null +++ b/paddle/fluid/platform/place_test.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/platform/place.h" +#include +#include "gtest/gtest.h" + +TEST(Place, Equality) { + paddle::platform::CPUPlace cpu; + paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); + + EXPECT_EQ(cpu, cpu); + EXPECT_EQ(g0, g0); + EXPECT_EQ(g1, g1); + EXPECT_EQ(g0, gg0); + + EXPECT_NE(g0, g1); + + EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); + EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu)); +} + +TEST(Place, Default) { + EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place())); + EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu())); + EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu())); + + EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place())); + paddle::platform::set_place(paddle::platform::CPUPlace()); + EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place())); +} + +TEST(Place, Print) { + { + std::stringstream ss; + ss << paddle::platform::CUDAPlace(1); + EXPECT_EQ("CUDAPlace(1)", ss.str()); + } + { + std::stringstream ss; + ss << paddle::platform::CPUPlace(); + EXPECT_EQ("CPUPlace", ss.str()); + } +} diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..28d2675f799a2d398d43dc31c550f0d84424116e --- /dev/null +++ b/paddle/fluid/platform/profiler.cc @@ -0,0 +1,346 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler.h" +#include +#include +#include "glog/logging.h" + +namespace paddle { +namespace platform { + +// The profiler state, the initial value is ProfilerState::kDisabled +static ProfilerState g_state = ProfilerState::kDisabled; +// To record which timer the profiler used, CUDA or CPU. +static std::string g_profiler_place = ""; +// The thread local event list only can be accessed by the specific thread +// The thread index of each thread +static thread_local int32_t g_thread_id; +// The g_next_thread_id is a global counter for threads, by the g_thread_id and +// g_next_thread_id, we can know how many threads have created EventList. +static uint32_t g_next_thread_id = 0; +// The global mutex +static std::mutex g_all_event_lists_mutex; +// The total event lists of all threads +static std::list> g_all_event_lists; +// The thread local event list only can be accessed by the specific thread +static thread_local std::shared_ptr g_event_list; + +inline uint64_t GetTimeInNsec() { + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + +Event::Event(EventKind kind, std::string name, uint32_t thread_id, + const DeviceContext* dev_ctx) + : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { +#ifdef PADDLE_WITH_CUDA + has_cuda_ = dev_ctx ? 
platform::is_gpu_place(dev_ctx->GetPlace()) : false; + if (has_cuda_) { + auto* cuda_dev_ctx = static_cast(dev_ctx); + PADDLE_ENFORCE(cudaGetDevice(&device_)); + PADDLE_ENFORCE(cudaEventCreate(&event_)); + auto stream = cuda_dev_ctx->stream(); + PADDLE_ENFORCE(cudaEventRecord(event_, stream)); + } +#endif + cpu_ns_ = GetTimeInNsec(); +} + +std::string Event::kind() const { + switch (kind_) { + case EventKind::kMark: + return "mark"; + case EventKind::kPushRange: + return "push"; + case EventKind::kPopRange: + return "pop"; + } + PADDLE_THROW("Unknown EventKind."); +} + +double Event::CpuElapsedMs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000000.0); +} + +double Event::CudaElapsedMs(const Event& e) const { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(e.has_cuda() && has_cuda()); + PADDLE_ENFORCE(e.device() == device()); + PADDLE_ENFORCE(cudaEventSynchronize(event_)); + PADDLE_ENFORCE(cudaEventSynchronize(e.event())); + float ms; + PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); + return ms; +#else + PADDLE_THROW("CUDA is not enabled"); +#endif +} + +#ifdef PADDLE_WITH_CUDA +static void ForEachDevice(std::function func) { + auto original_device = GetCurrentDeviceId(); + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + func(i); + } + SetDeviceId(original_device); +} +#endif + +inline EventList& GetEventList() { + if (!g_event_list) { + std::lock_guard guard(g_all_event_lists_mutex); + g_event_list = std::make_shared(); + g_thread_id = g_next_thread_id++; + g_all_event_lists.emplace_front(g_event_list); + } + return *g_event_list; +} + +void Mark(const std::string& name, const DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); +} + +void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); +} + +void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { + 
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); +} + +RecordEvent::RecordEvent(const std::string& name, + const DeviceContext* dev_ctx) { + if (g_state == ProfilerState::kDisabled) return; + dev_ctx_ = dev_ctx; + name_ = name; + PushEvent(name_, dev_ctx_); +} + +RecordEvent::~RecordEvent() { + if (g_state == ProfilerState::kDisabled) return; + PopEvent(name_, dev_ctx_); +} + +void EnableProfiler(ProfilerState state) { + PADDLE_ENFORCE(state != ProfilerState::kDisabled, + "Can't enbale profling, since the input state is ", + "ProfilerState::kDisabled"); + PADDLE_ENFORCE(g_state == ProfilerState::kDisabled, + "The profiling state should be disabled when calling ", + "EnableProfiler."); + g_state = state; + g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; +#ifdef PADDLE_WITH_CUDA + if (g_state == ProfilerState::kCUDA) { + // Generate some dummy evenets first to reduce the startup overhead. + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); + Mark("_cuda_startup_", dev_ctx); + dev_ctx->Wait(); + delete dev_ctx; + }); + } + } +#endif + // Mark the profiling start. + Mark("_start_profiler_", nullptr); +} + +void ResetProfiler() { + std::lock_guard guard(g_all_event_lists_mutex); + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + (*it)->Clear(); + } +} + +std::vector> GetAllEvents() { + std::lock_guard guard(g_all_event_lists_mutex); + std::vector> result; + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + result.emplace_back((*it)->Reduce()); + } + return result; +} + +void DisableProfiler(EventSortingKey sorted_key) { + PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, + "Can't disable profiling, since it's not starting."); + // Mark the profiling stop. 
+ Mark("_stop_profiler_", nullptr); + g_state = ProfilerState::kDisabled; + + std::vector> all_events = GetAllEvents(); + ParseEvents(all_events, sorted_key); + ResetProfiler(); +} + +void ParseEvents(std::vector>& events, + EventSortingKey sorted_by) { + if (g_profiler_place == "") return; + + std::string sorted_domain; + std::function sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.ave_time > b.ave_time; + }; + break; + default: + sorted_domain = "event first end time"; + } + + std::vector> events_table; + size_t max_name_width = 0; + for (size_t i = 0; i < events.size(); i++) { + std::list pushed_events; + std::vector event_items; + std::unordered_map event_idx; + + for (size_t j = 0; j < events[i].size(); j++) { + if (events[i][j].kind() == "push") { + pushed_events.push_back(events[i][j]); + } else if (events[i][j].kind() == "pop") { + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend() && + rit->name() != events[i][j].name()) { + ++rit; + } + + if (rit != pushed_events.rend()) { + double event_time = (g_profiler_place == "CUDA") + ? 
rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + + if (event_idx.find(event_name) == event_idx.end()) { + event_idx[event_name] = event_items.size(); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time}; + event_items.push_back(event_item); + } else { + int index = event_idx[event_name]; + event_items[index].calls += 1; + // total time + event_items[index].total_time += event_time; + // min time + event_items[index].min_time = + std::min(event_time, event_items[index].min_time); + // max time + event_items[index].max_time = + std::max(event_time, event_items[index].max_time); + } + + // remove the push marker from the list + pushed_events.erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << events[i][j].name() + << "\', which will be ignored in profiling report."; + } + } + } + // average time + for (auto& item : event_items) { + item.ave_time = item.total_time / item.calls; + } + // sort + if (sorted_by != EventSortingKey::kDefault) { + std::sort(event_items.begin(), event_items.end(), sorted_func); + } + + events_table.push_back(event_items); + // log warning if there are events with `push` but without `pop` + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend()) { + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; + ++rit; + } + } + + // Print report + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); +} + +void PrintProfiler(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << 
"<-------------------------\n\n"; + std::cout << "Place: " << g_profiler_place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." << std::endl; + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + EventItem& event_item = events_table[i][j]; + std::cout << std::setw(name_width) << event_item.name + << std::setw(data_width) << event_item.calls + << std::setw(data_width) << event_item.total_time + << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time << std::endl; + } + } + std::cout << std::endl; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..0bc5e666cb4f28b99169435cb3dd52829c35a2c2 --- /dev/null +++ b/paddle/fluid/platform/profiler.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { + +enum EventKind { kMark, kPushRange, kPopRange }; + +class Event { + public: + // The DeviceContext is used to get the cuda stream. + // If CPU profiling mode, can pass nullptr. + Event(EventKind kind, std::string name, uint32_t thread_id, + const DeviceContext* dev_ctx); + + std::string kind() const; + std::string name() const { return name_; } + uint32_t thread_id() const { return thread_id_; } + bool has_cuda() const { return has_cuda_; } + +#ifdef PADDLE_WITH_CUDA + cudaEvent_t event() const { return event_; } + int device() const { return device_; } +#endif + + double CpuElapsedMs(const Event& e) const; + double CudaElapsedMs(const Event& e) const; + + private: + EventKind kind_; + std::string name_; + uint32_t thread_id_; + int64_t cpu_ns_; + bool has_cuda_; +#ifdef PADDLE_WITH_CUDA + cudaEvent_t event_ = nullptr; + int device_ = -1; +#endif +}; + +struct EventList { + constexpr static size_t kMB = 1024 * 1024; + constexpr static size_t kEventBlockSize = 16 * kMB; + constexpr static size_t kEventSize = sizeof(Event); + constexpr static size_t kEventAlign = alignof(Event); + constexpr static size_t kNumBlock = + kEventBlockSize / + ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); + + template + void Record(Args&&... 
args) { + if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { + event_blocks.emplace_front(); + event_blocks.front().reserve(kNumBlock); + } + event_blocks.front().emplace_back(std::forward(args)...); + } + + std::vector Reduce() { + std::vector result; + for (auto& block : event_blocks) { + result.insert(result.begin(), std::make_move_iterator(block.begin()), + std::make_move_iterator(block.end())); + } + event_blocks.clear(); + return result; + } + + void Clear() { event_blocks.clear(); } + + std::forward_list> event_blocks; +}; + +enum ProfilerState { + kDisabled, // disabled state + kCPU, // CPU profiling state + kCUDA, // GPU profiling state +}; + +void Mark(const std::string& name, const DeviceContext* dev_ctx); + +void PushEvent(const std::string& name, const DeviceContext* dev_ctx); + +void PopEvent(const std::string& name, const DeviceContext* dev_ctx); + +struct RecordEvent { + explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx); + + ~RecordEvent(); + + // The device context is used by Event to get the current cuda stream. + const DeviceContext* dev_ctx_; + // Event name + std::string name_; +}; + +// Return the event list of all threads. Asummed the returned value calls +// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. +std::vector> GetAllEvents(); + +// The information of each event given in the profiling report +struct EventItem { + std::string name; + int calls; + double total_time; + double min_time; + double max_time; + double ave_time; +}; + +// Candidate keys to sort the profiling report +enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; + +// Enable the profiling function. +void EnableProfiler(ProfilerState state); + +// Clear the g_all_event_lists, which is total event lists of all threads. 
+void ResetProfiler(); + +void DisableProfiler(EventSortingKey sorted_key); + +// Parse the event list and output the profiling report +void ParseEvents(std::vector>&, + EventSortingKey sorted_by = EventSortingKey::kDefault); + +// Print results +void PrintProfiler(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width); + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2525c38b6fb98ea9bac49f7eb28e755bd7fa9a2 --- /dev/null +++ b/paddle/fluid/platform/profiler_test.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/profiler.h" +#include "gtest/gtest.h" + +TEST(Event, CpuElapsedTime) { + using paddle::platform::Event; + using paddle::platform::EventKind; + + Event start_event(EventKind::kPushRange, "test", 0, nullptr); + EXPECT_TRUE(start_event.has_cuda() == false); + int counter = 0; + while (counter != 1000) { + counter++; + } + Event stop_event(EventKind::kPopRange, "test", 0, nullptr); + EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Event, CudaElapsedTime) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + using paddle::platform::Event; + using paddle::platform::EventKind; + + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); + Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); + EXPECT_TRUE(start_event.has_cuda() == true); + int counter = 0; + while (counter != 1000) { + counter++; + } + Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); + EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); +} +#endif + +TEST(RecordEvent, RecordEvent) { + using paddle::platform::DeviceContext; + using paddle::platform::Event; + using paddle::platform::EventKind; + using paddle::platform::RecordEvent; + using paddle::platform::ProfilerState; + using paddle::platform::EventSortingKey; + + ProfilerState state = ProfilerState::kCPU; + DeviceContext* dev_ctx = nullptr; +#ifdef PADDLE_WITH_CUDA + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + state = ProfilerState::kCUDA; + dev_ctx = + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0)); +#endif + EnableProfiler(state); + + /* Usage 1: + * PushEvent(evt_name, dev_ctx); + * ... + * code to be analyzed + * ... 
+ * PopEvent(evt_name, dev_ctx); + */ + for (int loop = 0; loop < 3; ++loop) { + for (int i = 1; i < 5; ++i) { + std::string name = "op_" + std::to_string(i); + PushEvent(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + PopEvent(name, dev_ctx); + } + } + + /* Usage 2: + * { + * RecordEvent record_event(name, dev_ctx); + * ... + * code to be analyzed + * ... + * } + */ + for (int i = 1; i < 5; ++i) { + std::string name = "evs_op_" + std::to_string(i); + RecordEvent record_event(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + } + + // Bad Usage: + PushEvent("event_without_pop", dev_ctx); + PopEvent("event_without_push", dev_ctx); + std::vector> events = paddle::platform::GetAllEvents(); + + int cuda_startup_count = 0; + int start_profiler_count = 0; + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { + if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; + if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; + if (events[i][j].name() == "push") { + EXPECT_EQ(events[i][j + 1].name(), "pop"); +#ifdef PADDLE_WITH_CUDA + EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); +#else + EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); +#endif + } + } + } + EXPECT_EQ(cuda_startup_count % 5, 0); + EXPECT_EQ(start_profiler_count, 1); + + // Will remove parsing-related code from test later + DisableProfiler(EventSortingKey::kTotal); +} diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h new file mode 100644 index 0000000000000000000000000000000000000000..879daed19102c85cc5ea03933f8324023cec0fe2 --- /dev/null +++ b/paddle/fluid/platform/transform.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#ifdef __NVCC__ +#include +#include +#include "paddle/fluid/platform/details/device_ptr_cast.h" +#endif + +namespace paddle { +namespace platform { + +// Transform on host or device. It provides the same API as std::transform. +template +struct Transform { + template + void operator()(const DeviceContext& context, InputIter first, InputIter last, + OutputIter result, UnaryOperation op); + + template + void operator()(const DeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op); +}; + +template <> +struct Transform { + template + void operator()(const platform::CPUDeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { + std::transform(first, last, result, op); + } + + template + void operator()(const platform::CPUDeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op) { + std::transform(first1, last1, first2, result, op); + } +}; + +#ifdef __NVCC__ +template <> +struct Transform { + template + void operator()(const platform::CUDADeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); +
thrust::transform(thrust::cuda::par.on(context.stream()), + details::DevPtrCast(first), details::DevPtrCast(last), + details::DevPtrCast(result), op); + } + + template + void operator()(const platform::CUDADeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); + thrust::transform(thrust::cuda::par.on(context.stream()), + details::DevPtrCast(first1), details::DevPtrCast(last1), + details::DevPtrCast(first2), details::DevPtrCast(result), + op); + } +}; +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e4b9edc2fd45e9c00f5339948172f6267210363 --- /dev/null +++ b/paddle/fluid/platform/transform_test.cu @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/transform.h" + +template +class Scale { + public: + explicit Scale(const T& scale) : scale_(scale) {} + + HOSTDEVICE T operator()(const T& a) const { return a * scale_; } + + private: + T scale_; +}; + +template +class Multiply { + public: + HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; + +TEST(Transform, CPUUnary) { + using namespace paddle::platform; + CPUDeviceContext ctx; + float buf[4] = {0.1, 0.2, 0.3, 0.4}; + Transform trans; + trans(ctx, buf, buf + 4, buf, Scale(10)); + for (int i = 0; i < 4; ++i) { + ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); + } +} + +TEST(Transform, GPUUnary) { + using namespace paddle::platform; + using namespace paddle::memory; + CUDAPlace gpu0(0); + CUDADeviceContext ctx(gpu0); + float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; + float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); + Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); + Transform trans; + trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); + ctx.Wait(); + Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); + Free(gpu0, gpu_buf); + for (int i = 0; i < 4; ++i) { + ASSERT_NEAR(cpu_buf[i], static_cast(i + 1), 1e-5); + } +} + +TEST(Transform, CPUBinary) { + using namespace paddle::platform; + using namespace paddle::memory; + int buf[4] = {1, 2, 3, 4}; + Transform trans; + CPUDeviceContext ctx; + trans(ctx, buf, buf + 4, buf, buf, Multiply()); + for (int i = 0; i < 4; ++i) { + ASSERT_EQ((i + 1) * (i + 1), buf[i]); + } +} + +TEST(Transform, GPUBinary) { + using namespace paddle::platform; + using namespace paddle::memory; + int buf[4] = {1, 2, 3, 4}; + CUDAPlace gpu0(0); + CUDADeviceContext ctx(gpu0); + int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); + Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); 
+ Transform trans; + trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); + ctx.Wait(); + Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); + Free(gpu0, gpu_buf); + for (int i = 0; i < 4; ++i) { + ASSERT_EQ((i + 1) * (i + 1), buf[i]); + } +} diff --git a/paddle/platform/variant.h b/paddle/fluid/platform/variant.h similarity index 100% rename from paddle/platform/variant.h rename to paddle/fluid/platform/variant.h diff --git a/paddle/fluid/pybind/.clang-format b/paddle/fluid/pybind/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/pybind/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d62f34030894e2fa21925bbc44e24b4e7d738d15 --- /dev/null +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -0,0 +1,9 @@ +if(WITH_PYTHON) + cc_library(paddle_pybind SHARED + SRCS pybind.cc exception.cc protobuf.cc const_value.cc + DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + ${GLOB_OP_LIB}) + if(NOT APPLE AND NOT ANDROID) + target_link_libraries(paddle_pybind rt) + endif(NOT APPLE AND NOT ANDROID) +endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc new file mode 100644 index 0000000000000000000000000000000000000000..098252a83d3b7e2926bf737ce7f2b3794046f28f --- /dev/null +++ b/paddle/fluid/pybind/const_value.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "const_value.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace pybind { + +void BindConstValue(pybind11::module& m) { + m.def("kEmptyVarName", [] { return framework::kEmptyVarName; }); + m.def("kTempVarName", [] { return framework::kTempVarName; }); + m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); + m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/const_value.h b/paddle/fluid/pybind/const_value.h new file mode 100644 index 0000000000000000000000000000000000000000..67d14ac9ff01d1754dd8dd165b638db12c9d0ea0 --- /dev/null +++ b/paddle/fluid/pybind/const_value.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "pybind11/pybind11.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +extern void BindConstValue(pybind11::module& m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc new file mode 100644 index 0000000000000000000000000000000000000000..7398a88541bcbf338ca9568595d0dc7b16eff118 --- /dev/null +++ b/paddle/fluid/pybind/exception.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/exception.h" + +namespace paddle { +namespace pybind { + +void BindException(pybind11::module& m) { + static pybind11::exception exc(m, "EnforceNotMet"); + pybind11::register_exception_translator([](std::exception_ptr p) { + try { + if (p) std::rethrow_exception(p); + } catch (const platform::EnforceNotMet& e) { + exc(e.what()); + } + }); + + m.def("__unittest_throw_exception__", [] { PADDLE_THROW("test exception"); }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h new file mode 100644 index 0000000000000000000000000000000000000000..43e91a706300e561a20d88abe80d9b6654bd2171 --- /dev/null +++ b/paddle/fluid/pybind/exception.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "pybind11/pybind11.h" +namespace paddle { +namespace pybind { + +extern void BindException(pybind11::module& m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc new file mode 100644 index 0000000000000000000000000000000000000000..4aefcf1a1cdcb3ad2408ded2f76a69570eccb41d --- /dev/null +++ b/paddle/fluid/pybind/protobuf.cc @@ -0,0 +1,297 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/protobuf.h" +#include +#include +#include "paddle/fluid/framework/backward.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +// Cast boost::variant for PyBind. 
+// Copied from +// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199 +namespace pybind11 { +namespace detail { + +// Can be replaced by a generic lambda in C++14 +struct variant_caster_visitor : public boost::static_visitor { + return_value_policy policy; + handle parent; + + variant_caster_visitor(return_value_policy policy, handle parent) + : policy(policy), parent(parent) {} + + template + handle operator()(T const &src) const { + return make_caster::cast(src, policy, parent); + } +}; + +template +struct variant_caster; + +template