diff --git a/.github/ISSUE_TEMPLATE/---feature-request-.md b/.github/ISSUE_TEMPLATE/---feature-request-.md new file mode 100644 index 0000000000000000000000000000000000000000..57708855dce4fcc81e719c59082a8a42415eba47 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---feature-request-.md @@ -0,0 +1,27 @@ +--- +name: 建议(Feature request) +about: 您可以提出您的建议。 You could use this template for reporting a suggestion  issue. + +--- + +欢迎您对PaddlePaddle提出建议,非常感谢您对PaddlePaddle的贡献! +在留下您的建议时,辛苦您同步提供如下信息: +- 版本、环境信息 +1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1 +2)CPU/GPU:您是否使用GPU进行训练,如是,请提供您的CUDA和cuDNN版本号 +3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 建议描述:请您详细描述,您认为需优化的功能 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +Please make sure that this is a feature request. +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +**To Reproduce** +Steps to reproduce the behavior +**Describe the feature and the current behavior/state.** +**Any Other info.** diff --git a/.github/ISSUE_TEMPLATE/---inference-issue-.md b/.github/ISSUE_TEMPLATE/---inference-issue-.md new file mode 100644 index 0000000000000000000000000000000000000000..37bdc8889e2722dda964ba82c2ac36cef5e60110 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---inference-issue-.md @@ -0,0 +1,40 @@ +--- +name: 预测(Inference Issue) +about: 您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准描述您的问题,例如“最新预测库的API文档在哪儿 ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID +    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 +    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 +    4)系统环境:请您描述系统类型、版本(如Mac OS 10.14),Python版本 +-预测信息 +    1)C++预测:请您提供预测库安装包的版本信息,及其中的version.txt文件 +    2)CMake包含路径的完整命令 +    3)API信息(如调用请提供) +    4)预测库来源:官网下载/特殊环境(如BCLOUD编译) +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that th +If there is no solution,please make sure that this is an inference issue including the following details : +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Python version +-Cmake orders +-C++version.txt +-API information +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---installation-issue-.md b/.github/ISSUE_TEMPLATE/---installation-issue-.md new file mode 100644 index 0000000000000000000000000000000000000000..ce4ba589324673baa4aa39760bcdcd66ecfdd36b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---installation-issue-.md @@ -0,0 +1,40 @@ +--- +name: 安装(Installation Issue) +about: 您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation +  issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +建立issue时,为快速解决问题,请您根据使用情况给出如下信息: +- 标题:请包含关键词“安装错误”/“编译错误”,例如“Mac编译错误” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID +    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况 +    3)GPU:请提供GPU型号,CUDA和CUDNN版本号 +    4)系统环境:请说明系统类型、版本(如Mac OS 10.14)、Python版本 +- 安装方式信息: +1)pip安装/docker安装 +2)本地编译:请提供cmake命令,编译命令 +3)docker编译:请提供docker镜像,编译命令            +  特殊环境请注明:如离线安装等 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is an installation issue including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg. Mac OS 10.14) +-Python version +- Install method: pip install/install with docker/build from source(without docker)/build within docker +- Other special cases that you think may be related to this problem, eg. offline install, special internet condition   +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---model-issue-.md b/.github/ISSUE_TEMPLATE/---model-issue-.md new file mode 100644 index 0000000000000000000000000000000000000000..7cb52f37b90262d2704fe32d8093fac19ed74b5f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---model-issue-.md @@ -0,0 +1,36 @@ +--- +name: 模型(Model Issue) +about: 您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/ + algorithm/dataset  issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +建立issue时,为快速解决问题,请您根据使用情况给出如下信息: +- 标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错  ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供PaddlePaddle版本号,例如1.1或CommitID +    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况 +    3)GPU:请提供GPU型号,CUDA和CUDNN版本号 +    4)系统环境:请说明系统类型、版本(例如Mac OS 10.14),Python版本 +- 模型信息 +    1)模型名称 2)使用数据集名称 3)使用算法名称 4)模型链接 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github.Probably there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is a issue of models including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Python version +-Name of Models&Dataset/details of operator +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---others-.md b/.github/ISSUE_TEMPLATE/---others-.md new file mode 100644 index 0000000000000000000000000000000000000000..6a291153e43f51e43646a1c860ec253361b137d6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---others-.md @@ -0,0 +1,33 @@ +--- +name: 其他(Others) +about: 如上述分类未包含您的问题,可在此提出。 You could use this template for reporting other issues + +--- + +为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准概括您的问题 +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID +    2)CPU/GPU:如果您使用GPU训练,请提供GPU驱动版本、CUDA和cuDNN版本号 +    3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14 +    4)Python版本号 +    5)显存信息 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +If there is no solution,please provide us with the following details : +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/cuDNN version +-OS Platform and Distribution(eg.Mac OS 10.14) +-Python version +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---training-issue-.md b/.github/ISSUE_TEMPLATE/---training-issue-.md new file mode 100644 index 0000000000000000000000000000000000000000..29e8383d9779229328c7c12d04ba6173fd0c8ba1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---training-issue-.md @@ -0,0 +1,38 @@ +--- +name: 训练(Training issue) +about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting an training +  issue. + +--- + +为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx" ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID +    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 +    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 +    4)系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本 +- 训练信息 +    1)单机/多机,单卡/多卡 +    2)显存信息 +    3)Operator信息 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is a training issue including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Other imformation: Distriuted training/informantion of operator/ +Graphics card storage +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/CMakeLists.txt b/CMakeLists.txt index bc2ac2cd939690456930d78ed0bda39dd0953173..8dcf9786e36fa8376720c5bac6417ecbd04b27f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,7 +204,9 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) +include(external/rocprim) include(external/xxhash) # download xxhash +include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream diff --git a/Dockerfile b/Dockerfile index c8b9eed6d60e5d3b32fc14c0c7af80a785145d1b..84e1edbee91b0c3877ec7c55161af8c3bdf1c0dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,29 @@ ENV HOME /root # Add bash enhancements COPY ./paddle/scripts/docker/root/ /root/ +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ + wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +RUN rm -r /root/python_build + RUN apt-get update && \ apt-get install -y --allow-downgrades patchelf \ python3 python3-dev python3-pip \ @@ -74,6 +97,12 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 RUN pip3 install -U wheel && \ pip3 install -U docopt PyYAML sphinx==1.5.6 && \ pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 install -U wheel && \ + pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 install -U wheel && \ + pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ pip install -U pip setuptools wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ @@ -82,22 +111,34 @@ RUN pip3 install -U wheel && \ RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3 install opencv-python && \ + pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 install opencv-python && \ + pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 install opencv-python && \ pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip install opencv-python #For docstring checker RUN pip3 install pylint pytest astroid isort +RUN pip3.6 install pylint pytest astroid isort +RUN pip3.7 install pylint pytest astroid isort RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ RUN pip3 install -r /root/requirements.txt +RUN pip3.6 install -r /root/requirements.txt +RUN pip3.7 install -r /root/requirements.txt RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev RUN pip3 install certifi urllib3[secure] +RUN pip3.6 install certifi urllib3[secure] +RUN pip3.7 install certifi urllib3[secure] RUN pip install certifi urllib3[secure] diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 964d5fd45b350db2e5948574f53a427e53484ff4..414e92eb27f56e0670e1977e67c2f5ca9c6bbcc2 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -199,8 +199,11 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() else(NOT WIN32) +list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") + list(APPEND CUDA_NVCC_FLAGS "-g -G") + # match the cl's _ITERATOR_DEBUG_LEVEL + list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake new file mode 100644 index 0000000000000000000000000000000000000000..94d8fcc66855627d665b8e84a47a2075e7253b03 --- /dev/null +++ b/cmake/external/dlpack.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack) +set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include) + +include_directories(${DLPACK_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dlpack + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/dmlc/dlpack.git" + GIT_TAG "v0.2" + PREFIX ${DLPACK_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(dlpack STATIC ${dummyfile}) +else() + add_library(dlpack INTERFACE) +endif() + +add_dependencies(dlpack extern_dlpack) + +LIST(APPEND externl_project_dependencies dlpack) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f06a93f38f24c6a8af3b45767e93a1a4..6aef97f21244efd09e22781f703553a19a9e1860 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -17,7 +17,7 @@ if(WITH_AMD_GPU) extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 785148d4f9f44032e2ce5bf93f0dc80fc865808b..b280db23b9b27bc658a79d01ea81122d2c987666 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -53,7 +53,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "21fb5f2af1dd14e132af4f1b79160977ee487818" + GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index c885877a2bcd622dc8d1f3ca2821a0bed84561bf..3a10ea945d3d1a3fa88954196905872b2668d5a0 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -26,7 +26,7 @@ ExternalProject_Add( extern_pybind ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/pybind/pybind11.git" - GIT_TAG "v2.1.1" + GIT_TAG "v2.2.4" PREFIX ${PYBIND_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake new file mode 100644 index 0000000000000000000000000000000000000000..914c06491890574bcdf4374d8e0fd5498e780113 --- /dev/null +++ b/cmake/external/rocprim.cmake @@ -0,0 +1,44 @@ +if (NOT WITH_AMD_GPU) + return() +endif() + +# rocprim is "ROCm Parallel Primitives" for short. +# It is a header-only library providing HIP and HC parallel primitives +# for developing performant GPU-accelerated code on AMD ROCm platform. + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +INCLUDE(ExternalProject) + +SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim) +SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim) +SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_rocprim + GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git" + GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc + PREFIX ${ROCPRIM_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc + CMAKE_ARGS -DONLY_INSTALL=ON + CMAKE_ARGS -DBUILD_TEST=OFF + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR} + + INSTALL_DIR ${ROCPRIM_INSTALL_DIR} + ${EXTERNAL_PROJECT_LOG_ARGS} +) + +INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR}) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";") + add_library(rocprim STATIC ${dummyfile}) +else() + add_library(rocprim INTERFACE) +endif() + +add_dependencies(rocprim extern_rocprim) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 343e44ab4bc21c1a656048b675062f1b897bbc77..c4472040cef870454c072c1b84a04e1ac592b476 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -129,6 +129,9 @@ set(COMMON_FLAGS -Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE + -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 + -Wimplicit-fallthrough=0 # Warning in tinyformat.h + -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 ) set(GPU_COMMON_FLAGS diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 111627a932afe2642312befb5d9c65e36e4bf1d0..312fbaa0b3d83c37debe78be82503103eabc0bfa 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -349,10 +349,17 @@ function(cc_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WIN32) + list(APPEND win32_deps shlwapi) + if("${cc_test_DEPS};" MATCHES "python;") + list(REMOVE_ITEM cc_test_DEPS python) + list(APPEND win32_deps ${PYTHON_LIBRARIES}) + endif() + endif(WIN32) add_executable(${TARGET_NAME} ${cc_test_SRCS}) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) if(WIN32) - target_link_libraries(${TARGET_NAME} shlwapi) + target_link_libraries(${TARGET_NAME} ${win32_deps}) endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(NAME ${TARGET_NAME} @@ -454,25 +461,29 @@ function(hip_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) - target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a) - find_fluid_modules(${TARGET_NAME}) + target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so) + find_fluid_modules(${TARGET_NAME}) endif() - if (hip_library_DEPS) - add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + if("${hip_library_DEPS}" MATCHES "ARCHIVE_START") + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). + # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. + target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END) + else() + target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) endif() # cpplint code style foreach(source_file ${hip_library_SRCS}) - string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - endif() + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() endforeach() else(hip_library_SRCS) if (hip_library_DEPS) - merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) + merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) else() - message(FATAL "Please specify source file or library in nv_library.") + message(FATAL "Please specify source file or library in nv_library.") endif() endif(hip_library_SRCS) endif() @@ -679,7 +690,7 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} diff --git a/cmake/hip.cmake b/cmake/hip.cmake index bfe491bd6b7602959d3dd60bd06c67993593cc9b..4276bc5b08cd88a52bb5782bca87fc37deabd830 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU) endif() include_directories("/opt/rocm/include") +include_directories("/opt/rocm/hip/include") +include_directories("/opt/rocm/miopen/include") include_directories("/opt/rocm/hipblas/include") include_directories("/opt/rocm/hiprand/include") include_directories("/opt/rocm/rocrand/include") @@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust") list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc") -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" ) +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) if(WITH_DSO) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO") endif(WITH_DSO) -if(WITH_DOUBLE) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE") -endif(WITH_DOUBLE) - if(WITH_TESTING) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") endif(WITH_TESTING) +if(WITH_DISTRIBUTE) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") +endif(WITH_DISTRIBUTE) + +if(WITH_GRPC) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") +endif(WITH_GRPC) + +if(NOT WITH_GOLANG) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG") +endif(NOT WITH_GOLANG) + +if(WITH_MKLDNN) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") +endif(WITH_MKLDNN) + +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") + +if(NOT WITH_RDMA) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA") +endif(NOT WITH_RDMA) + + + if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 17107e0698757997854e4627d30de60d9a9df11b..89726bf9859e71ee04c2f9380554090845fd44e5 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -109,7 +109,8 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" -"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op") +"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" +"fusion_transpose_flatten_concat_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ba90165beab0e02c80a4eeaacbf265b08a806854..e06d3459dac44b4739b833dcd9d6c03ea61f1af3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,10 +26,10 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None -paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) @@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) -paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None)) +paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) @@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) -paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 43e1bc6b2efecd24d2d5bade5b6a7727bfb0a607..52946c7f11f90490b1af1347f20db236a8fe24af 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -116,8 +116,9 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) +cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) + shape_inference data_transform lod_tensor profiler transfer_scope_cache) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -192,3 +193,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) + +cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) +cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b8690156763e4037811245b8016982710445e6a2..a003995ae3f8e111881b4681554aa8eb17b60cc1 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -23,7 +23,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() { } if (platform::is_gpu_place(lod_tensors[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); int dtype = -1; size_t numel = 0; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index f6ef3a1367b91b6abf8ce74a91f73056efd0f84e..b449796fcaee73a6b84e0db2b5c76ff94bedcf08 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace framework { namespace details { struct AllReduceOpHandle : public OpHandleBase { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); @@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase { private: std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; #endif }; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 8e5e5427659387d63eac21a200c1a20da493e539..d98df3bbadd391d6df9b0a65a063e43e07a06fbc 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) VarHandle *out_handle = nullptr; int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 72180fac864256ddda076c57e50ab1083c113d32..0c75e05f861636565ae855ddd534c1082d40d237 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -34,7 +34,7 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) @@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 4305eb65733a7c871450949ce2c48cab013bac81..df3b3cc9ca012eabc428a7fb4c3af9be5b1c5bd5 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -42,7 +42,7 @@ struct TestBroadcastOpHandle { std::vector> nodes_; std::vector place_list_; bool use_gpu_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif @@ -50,7 +50,7 @@ struct TestBroadcastOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -60,7 +60,7 @@ struct TestBroadcastOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -84,7 +84,7 @@ struct TestBroadcastOpHandle { place_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_.reset(nullptr); #endif } @@ -106,14 +106,14 @@ struct TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("node0", ir::Node::Type::kOperation)); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 37202f869508c283e1b464942cadc0ebe3eef39c..70baced0ada33c23ba05cd2722e607edf847585a 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -96,7 +96,7 @@ std::unique_ptr BuildStrategy::Apply( const std::string &loss_var_name, const std::unordered_set ¶m_names, const std::vector &local_scopes, -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else const bool use_cuda) const { @@ -118,7 +118,7 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("local_scopes"); pass->SetNotOwned>("local_scopes", &local_scopes); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index fc2641dbd48274b43db0b1f156e3e1128f96772e..3236c35efdbf1175c3d06e531fc551f202ae17ad 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -23,7 +23,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -98,7 +98,7 @@ struct BuildStrategy { const std::string &loss_var_name, const std::unordered_set ¶m_names, const std::vector &local_scopes, -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; #else const bool use_cuda) const; diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 0b772f9b63e2cfb78175f5e0d7011db8e6a5ec20..cc562c7b102cea80e18cbd2c054c34415a7442c9 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) DataBalanceOpHandle::DataBalanceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h index 0462fb6ec713eb977f420a9cb485c0273e782496..2db18a1a7203f85aac6338576f2e68c7b37d7c69 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.h +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace details { struct DataBalanceOpHandle : public OpHandleBase { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index e37259526a5f6f57d51a0ca8bca96a18211a4790..e43d545c9c0d062a8814a2f5d27f9ca20290b9dd 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -25,7 +25,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ namespace details { struct FusedBroadcastOpHandle : public BroadcastOpHandle { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) FusedBroadcastOpHandle(ir::Node *node, const std::vector local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index 541993c74332cc483a8b854a6b8f227c7c9a19a9..be0d941c4f9c2fe8fbb1da8ec2c11868112fcf9b 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not supported."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 8c98b781301e884d5d5c7d141f3d901d74d51285..26666212ae8c4dc5ce9a45b5c51bab1f9ff1a8ab 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const { places_ = Get>(kPlaces); local_scopes_ = Get>(kLocalScopes); strategy_ = Get(kStrategy); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif @@ -431,7 +431,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } bool use_gpu = false; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) use_gpu = nccl_ctxs_ != nullptr; #endif @@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { void MultiDevSSAGraphBuilder::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ir::Graph *result, const std::vector &datas) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index f3ec2d29415240b7012f458070223469d0947166..8e462aec7dc7ce45cad592b89de0b6edde8c9146 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t device_id) const; void Init() const; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) mutable platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 4503123eac810917cabcf1e62cff98552ed2f742..c9f1107aeab5a21d46e828308cfcb2dde827cba6 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() { } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::GetMutableTensor(out_var).mutable_data( diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 999828ae457ba43541da06088ce7c25331fd05ec..846839029ca65be1bdeac2f6ea497db07a01b6cf 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 72299c0bfa916d3b92e1c5020ddd69dadad3701d..6cee4770e64354cf8a719b0e11b1816b345dd8bd 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -35,7 +35,7 @@ struct TestReduceOpHandle { std::vector gpu_list_; std::vector> ctxs_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif @@ -43,7 +43,7 @@ struct TestReduceOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -53,7 +53,7 @@ struct TestReduceOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -77,7 +77,7 @@ struct TestReduceOpHandle { gpu_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_.reset(nullptr); #endif } @@ -99,14 +99,14 @@ struct TestReduceOpHandle { nodes.emplace_back(new ir::Node("node")); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..04e3f78afe44bf748e4514fd82e5571ce2a50838 --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" + +namespace paddle { +namespace framework { + +namespace internal { +template +static ::DLDataType GetDLDataTypeCode() { + ::DLDataType dtype; + if (std::is_same::value || + std::is_floating_point::value) { + dtype.code = kDLFloat; + } else if (std::is_unsigned::value) { + dtype.code = kDLUInt; + } else if (std::is_integral::value) { + dtype.code = kDLInt; + } else { + PADDLE_THROW("Unsupported data type %s", typeid(T).name()); + } + dtype.bits = 8 * sizeof(T); + dtype.lanes = 1; + return dtype; +} + +static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { +#define REG_DL_DATA_TYPE(type) \ + { std::type_index(typeid(type)), GetDLDataTypeCode() } + static const std::unordered_map + type_to_dtype_map({ + REG_DL_DATA_TYPE(platform::float16), // NOLINT + REG_DL_DATA_TYPE(float), // NOLINT + REG_DL_DATA_TYPE(double), // NOLINT + REG_DL_DATA_TYPE(int), // NOLINT + REG_DL_DATA_TYPE(int64_t), // NOLINT + REG_DL_DATA_TYPE(bool), // NOLINT + REG_DL_DATA_TYPE(size_t), // NOLINT + REG_DL_DATA_TYPE(int16_t), // NOLINT + REG_DL_DATA_TYPE(uint8_t), // NOLINT + REG_DL_DATA_TYPE(int8_t) // NOLINT + }); + static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); + auto it = type_to_dtype_map.find(type); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", + type.name()); + return it->second; +#undef REG_DL_DATA_TYPE +} + +struct DLContextVisitor : public boost::static_visitor<::DLContext> { + inline ::DLContext operator()(const platform::CPUPlace &place) const { + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + return ctx; + } + + inline ::DLContext operator()(const platform::CUDAPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLGPU; + ctx.device_id = place.device; + return ctx; +#else + PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version"); +#endif + } + + inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLCPUPinned; + ctx.device_id = 0; + return ctx; +#else + PADDLE_THROW( + "platform::CUDAPinnedPlace is not supported in CPU only version"); +#endif + } +}; +} // namespace internal + +DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { + // init data, data buffer + t_.data = const_cast(tensor.data()); + + // init ctx, DLContext type with device_type and device_id + auto place = tensor.place(); + t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + + // init dtype + t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); + t_.dtype.lanes = lanes; + + // init ndim, tensor rank + auto &dims = tensor.dims(); + using DimType = decltype(t_.ndim); // int + t_.ndim = static_cast(dims.size()); + + // init shape, tensor dims + t_.shape = shape_; + for (DimType i = 0; i < t_.ndim; ++i) { + t_.shape[i] = dims[i]; + } + + // init strides, nullptr means the tensor is compact + t_.strides = nullptr; + + // init byte_offset + t_.byte_offset = 0; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..0c52bce1ef6af9b92bcb9f87c6781de878ed5898 --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class DLPackTensor { + public: + using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t + using ShapeType = + std::remove_reference::type; // int64_t + + // lanes is only used in CPU to enable vectorization + explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1); + + inline operator const ::DLTensor&() const { return t_; } + + inline operator ::DLTensor&() { return t_; } + + private: + ::DLTensor t_; + + // The shape in DLTensor is defined as int64_t* + // Add this member to make TVMTensor init without heap allocation + ShapeType shape_[9]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..938b05635004fcc417f753d5912269333e3ebc01 --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +namespace { // NOLINT +template +constexpr uint8_t GetDLDataTypeCode() { + return std::is_same::value || + std::is_floating_point::value + ? static_cast(kDLFloat) + : (std::is_unsigned::value + ? static_cast(kDLUInt) + : (std::is_integral::value ? static_cast(kDLInt) + : static_cast(-1))); +} +} // NOLINT + +template +void TestMain(const platform::Place &place, uint16_t lanes) { + DDim dims{4, 5, 6, 7}; + Tensor tensor; + tensor.Resize(dims); + void *p = tensor.mutable_data(place); + + DLPackTensor dlpack_tensor(tensor, lanes); + ::DLTensor &dl_tensor = dlpack_tensor; + + CHECK_EQ(p, dl_tensor.data); + if (platform::is_cpu_place(place)) { + CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else if (platform::is_gpu_place(place)) { + CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(boost::get(place).device, + dl_tensor.ctx.device_id); + } else if (platform::is_cuda_pinned_place(place)) { + CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else { + CHECK_EQ(false, true); + } + + CHECK_EQ(dims.size(), dl_tensor.ndim); + for (auto i = 0; i < dims.size(); ++i) { + CHECK_EQ(dims[i], dl_tensor.shape[i]); + } + + CHECK_EQ(dl_tensor.strides == nullptr, true); + CHECK_EQ(static_cast(0), dl_tensor.byte_offset); + + CHECK_EQ(lanes, dl_tensor.dtype.lanes); + CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits); + + CHECK_EQ(GetDLDataTypeCode(), dl_tensor.dtype.code); +} + +template +void TestMainLoop() { +#ifdef PADDLE_WITH_CUDA + std::vector places{platform::CPUPlace(), + platform::CUDAPlace(0), + platform::CUDAPinnedPlace()}; + if (platform::GetCUDADeviceCount() > 1) { + places.emplace_back(platform::CUDAPlace(1)); + } +#else + std::vector places{platform::CPUPlace()}; +#endif + std::vector lanes{1, 2}; + for (auto &p : places) { + for (auto &l : lanes) { + TestMain(p, l); + } + } +} + +#define PADDLE_DLPACK_TEST(type) \ + TEST(dlpack, test_##type) { TestMainLoop(); } + +using float16 = platform::float16; +PADDLE_DLPACK_TEST(float16); +PADDLE_DLPACK_TEST(float); +PADDLE_DLPACK_TEST(double); +PADDLE_DLPACK_TEST(int); +PADDLE_DLPACK_TEST(int64_t); +PADDLE_DLPACK_TEST(bool); +PADDLE_DLPACK_TEST(size_t); +PADDLE_DLPACK_TEST(int16_t); +PADDLE_DLPACK_TEST(uint8_t); +PADDLE_DLPACK_TEST(int8_t); + +#undef PADDLE_DLPACK_TEST + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 7ce08b728d9436c3b6e678faf328ddf1c45b7080..3dc571d75706b732fe9b254897b6cbd2e206cfc3 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -391,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr> gc; - // WhileOp would set keep_kids to false - // WhileGradOp would need the scopes created in WhileOp + // WhileOp would set keep_kids to true, + // because WhileGradOp needs the scopes created in WhileOp. // Perhaps, we should not perform eager deletion in WhileOp // The scopes and variables created by WhileOp would be deleted // in WhileGradOp. diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index cd2cb0c9f8a8ecc41a878cd3f711713cb5c23eb3..9696441a21661db89146c448742a992d1f7df022 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -15,7 +15,10 @@ #include "paddle/fluid/framework/ir/is_test_pass.h" #include - +#ifdef _WIN32 +#undef FALSE +#undef TRUE +#endif namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 669d08c70c9b7453264806b346a6c9eb211cfd4a..9b2eeaf59a536b5e8577f513b392648ed4270aab 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -26,10 +26,8 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" -#if !defined(_WIN32) #include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/writer.h" -#endif // _WIN32 namespace paddle { namespace framework { @@ -305,7 +303,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, TensorFromStream(is, static_cast(tensor), dev_ctx); } -#if !defined(_WIN32) void WriteToRecordIO(recordio::Writer *writer, const std::vector &tensor, const platform::DeviceContext &dev_ctx) { @@ -335,19 +332,7 @@ bool ReadFromRecordIO(recordio::Scanner *scanner, return true; } -#else -class Writer {}; -class Scanner {}; -void WriteToRecordIO(recordio::Writer *writer, - const std::vector &tensor, - const platform::DeviceContext &dev_ctx) {} -bool ReadFromRecordIO(recordio::Scanner *scanner, - const platform::DeviceContext &dev_ctx, - std::vector *result_ptr) { - PADDLE_ENFORCE("windows didn't supported recordio!."); - return true; -} -#endif // _WIN32 + std::vector LoDTensor::SplitLoDTensor( const std::vector places) const { check_memory_size(); diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index cbf5fd04d73007d303d0fd96064e3a2d7f21cfb8..cd50aaa26054b78f1b1e8f0d470b397892155a2b 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -274,7 +274,6 @@ TEST(LoD, ConvertToOffsetBasedLoD) { EXPECT_EQ(offset_lod, expected); } -#if !defined(_WIN32) template static void TestRecordIO() { LoDTensor tensor; @@ -321,7 +320,6 @@ TEST(LoDTensor, RecordIO) { TestRecordIO(); TestRecordIO(); } -#endif // !defined(_WIN32) } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index e8e53f988f92685cd4854b21202bcf7f9b1a4383..e8295639520b5838dce3c9c9e443cc846bd9c1ec 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -83,6 +83,7 @@ void NaiveExecutor::Run() { for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() << " on scope " << scope_; + op->SetIsCalledByExecutor(false); op->Run(*scope_, place_); } } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index fbaa169df6324761ef9136aa173dce4e2182ed38..362cda3f2329bef1abaa93b4529e506d41f07606 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = std::vector(); break; } + case proto::AttrType::LONGS: { + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from LONGS to LONGS"; + this->attrs_[name] = std::vector(); + break; + } case proto::AttrType::FLOATS: { VLOG(110) << "SetAttr: " << Type() << ", " << name << " from INTS to FLOATS"; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 1ec170b6f65f9c3ee0f80fb8904026b5438c94b2..1d386a608ae1d98fd47b95fb489662f7edd9d1fb 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false, namespace paddle { namespace framework { -// Combine two hash values to a single hash. -inline size_t CombineHash(size_t seed, size_t a) { - return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); -} - std::vector> kKernelPriority = { std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), @@ -153,17 +149,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #endif } -// The profile has a process-wide mutex, results in serious performance issue -// in concurrency scenerio. Here use an `if` to fix this issue. -// Please not remove the `if`, ask @Superjomn if there are any concern. -#ifndef _WIN32 + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. if (platform::IsProfileEnabled()) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - } else -#endif - { + } else { RunImpl(scope, place); } VLOG(30) << place << " " << DebugStringEx(&scope); @@ -797,17 +790,6 @@ void OperatorWithKernel::TransferInplaceVarsBack( Scope* OperatorWithKernel::TryTransferData( const Scope& scope, const OpKernelType& expected_kernel_key, std::vector* transfered_inplace_vars) const { -// In the inference scenerio, the scopes will be reused across the batches, so -// the `new_scope` here will result in GPU memroy explosion over the running of -// operators. -// We use a thread_local cache to fix that issue, the key in the cache is the -// combination of the `scope` argument, from_kernel_type, target_kernel_type. -// Have a discussion with @Superjomn or the inference developers if some changes -// on this logic for this macro might not tested on the other scenerios. -#ifdef PADDLE_ON_INFERENCE - thread_local std::unordered_map infer_transfer_scope_cache; -#endif - Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { for (auto& var_name : var_name_item.second) { @@ -838,23 +820,23 @@ Scope* OperatorWithKernel::TryTransferData( VLOG(30) << "Transform Variable " << var_name << " from " << kernel_type_for_var << " to " << expected_kernel_key; -#ifdef PADDLE_ON_INFERENCE - size_t infer_cache_key = - CombineHash(OpKernelType::Hash()(kernel_type_for_var), - OpKernelType::Hash()(expected_kernel_key)); - infer_cache_key = - CombineHash(infer_cache_key, std::hash()(&scope)); - - auto it = infer_transfer_scope_cache.find(infer_cache_key); - if (it != infer_transfer_scope_cache.end()) { - new_scope = infer_transfer_scope_cache[infer_cache_key]; - } else { - new_scope = &scope.NewScope(); - infer_transfer_scope_cache[infer_cache_key] = new_scope; + // In the inference scenerio, the scopes will be reused across the + // batches, so the `new_scope` here will result in GPU memroy explosion + // over the running of operators. + // We use a thread_local cache to fix that issue, the key in the cache is + // the combination of the `scope` argument, from_kernel_type, + // target_kernel_type. + // Have a discussion with @Superjomn or the inference developers if some + // changes on this logic for this macro might not tested on the other + // scenerios. + // If this op is not called by an Executor or ParallelExecutor, it should + // called by a NaiveExecutor, the NaiveExecutor will cache the scopes and + // variables, that behavior a lot different. + if (!run_by_executor_) { + new_scope = TryCreateTransferScope(kernel_type_for_var, + expected_kernel_key, &scope); } -#endif - - if (new_scope == nullptr) { + if (!new_scope) { new_scope = &scope.NewScope(); } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index ef838332177c018865a922d570c697b4a94969b6..bfdfdc56b34098dd170f8ca98d27b41759c2f57b 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -127,6 +127,8 @@ class OperatorBase { //! Get all outputs variable names virtual std::vector OutputVars(bool has_intermediate) const; + void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } + protected: std::string type_; // NOTE: in case of OpGrad, inputs_ contains: @@ -139,6 +141,8 @@ class OperatorBase { // IG (Inputs Gradients) VariableNameMap outputs_; AttributeMap attrs_; + // Whether this operator executes in an Executor. + bool run_by_executor_{true}; private: void GenerateTemporaryNames(); diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc new file mode 100644 index 0000000000000000000000000000000000000000..f6219a14173094d15e9c60a2e26f98da1b04ec2e --- /dev/null +++ b/paddle/fluid/framework/transfer_scope_cache.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/transfer_scope_cache.h" + +namespace paddle { +namespace framework { + +// Holds all the transfer scope across the process. +std::unordered_map& global_transfer_data_cache() { + typedef std::unordered_map map_t; + thread_local std::unique_ptr x(new map_t); + return *x; +} + +// Holds all the transfer scope for this thread. +std::unordered_set& global_transfer_scope_cache() { + typedef std::unordered_set set_t; + thread_local std::unique_ptr x(new set_t); + return *x; +} + +// Try to create a transfer scope. If one cached scope has match the +// requirement, just return that one. +// Inputs: +// @type0: the source kernel type. +// @type1: the target kernel type. +// @scope: the execution scope of this op. +// Returns: A scope used to hold the transfer data across the different kernel +// type. +Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, + const Scope* scope) { + Scope* new_scope{nullptr}; + size_t infer_cache_key = + CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1)); + infer_cache_key = + CombineHash(infer_cache_key, std::hash()(scope)); + + auto it = global_transfer_data_cache().find(infer_cache_key); + if (it != global_transfer_data_cache().end()) { + new_scope = global_transfer_data_cache()[infer_cache_key]; + } else { + new_scope = &scope->NewScope(); + global_transfer_data_cache()[infer_cache_key] = new_scope; + } + global_transfer_scope_cache().insert(new_scope); + return new_scope; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/transfer_scope_cache.h b/paddle/fluid/framework/transfer_scope_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..86fc0bf52972ae22a9bf269d71291f7fcd593880 --- /dev/null +++ b/paddle/fluid/framework/transfer_scope_cache.h @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +std::unordered_map& global_transfer_data_cache(); + +std::unordered_set& global_transfer_scope_cache(); + +// Combine two hash values to a single hash. +static size_t CombineHash(size_t seed, size_t a) { + return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, + const Scope* scope); + +void RemoveKidsFromTransferScopeCache(Scope* scope); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 2c5364b72402befd2c34e5f542ce5c6b2add621d..058a5b5f460d2bd3c4c0248929dd0c87f7506930 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -4,6 +4,7 @@ endif() # analysis and tensorrt must be added before creating static library, # otherwise, there would be undefined reference to them in static library. add_subdirectory(analysis) +add_subdirectory(utils) if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 84a0c3374c66f85313828332099cb372e14c7c83..7710ed7b613a180a4e24237a2cf6b5f212f677a3 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { @@ -75,7 +76,7 @@ void TestWord2vecPrediction(const std::string& model_path) { 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. - for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) { LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e9969b84f33483b048951f704de1e13e51cbeaea..eda251c5346a6d970ecd0956f976cbef41e6c1c1 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager) cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce) cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS + lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config + analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array) cc_test(test_paddle_inference_api SRCS api_tester.cc diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 5ccd2dc5ab353b1634b651a4b7caa2af0da75ce4..dd75f0d9a65404908667d873786160ddaa73fa57 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; @@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cb14d2a2602808bd35106ed2bafcf7975f549597..ebe56734c6d6a9137e877f8a5ea188f59a1b4f98 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -31,11 +31,11 @@ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #endif #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); -DECLARE_int32(paddle_num_threads); namespace paddle { @@ -56,7 +56,6 @@ bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, const std::shared_ptr &program) { VLOG(30) << "Predictor::init()"; -#if !defined(_WIN32) if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; @@ -64,10 +63,9 @@ bool AnalysisPredictor::Init( : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } -#endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (!PrepareScope(parent_scope)) { return false; @@ -160,6 +158,14 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +void AnalysisPredictor::SetMkldnnThreadID(int tid) { +#ifdef PADDLE_WITH_MKLDNN + platform::set_cur_thread_id(tid); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; +#endif +} + bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { @@ -167,7 +173,6 @@ bool AnalysisPredictor::Run(const std::vector &inputs, inference::Timer timer; timer.tic(); // set feed variable - std::vector feeds; framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); if (!SetFeed(inputs, scope)) { LOG(ERROR) << "fail to set feed"; @@ -208,17 +213,29 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, framework::DDim ddim = framework::make_ddim(inputs[i].shape); void *input_ptr; if (inputs[i].dtype == PaddleDType::INT64) { - input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + input_ptr = input.mutable_data(ddim, place_); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { - input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + input_ptr = input.mutable_data(ddim, place_); } else { LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; return false; } - // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. - std::memcpy(static_cast(input_ptr), inputs[i].data.data(), - inputs[i].data.length()); + if (platform::is_cpu_place(place_)) { + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + } else { +#ifdef PADDLE_WITH_CUDA + auto dst_gpu_place = boost::get(place_); + memory::Copy(dst_gpu_place, static_cast(input_ptr), + platform::CPUPlace(), inputs[i].data.data(), + inputs[i].data.length(), + 0); // stream 0 for sync copy +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. framework::LoD lod; for (auto &level : inputs[i].lod) { @@ -501,12 +518,10 @@ bool AnalysisPredictor::LoadParameters() { } AnalysisPredictor::~AnalysisPredictor() { -#if !defined(_WIN32) if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); } -#endif if (sub_scope_) { scope_->DeleteScope(sub_scope_); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index cf81b7db738d899566ddf32c5e5a40475c8e7bc7..db57812bc3ba8e6d578e665524cb5749e6bfecd6 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor { framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } + void SetMkldnnThreadID(int tid); + protected: bool PrepareProgram(const std::shared_ptr &program); bool PrepareScope(const std::shared_ptr &parent_scope); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index fcbc3803d04def9a9855f2fee489e7e2c561b454..a7cef426d17908bfae59a4a5a204237ab580a5a1 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -24,11 +24,11 @@ limitations under the License. */ #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); -DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -64,7 +64,6 @@ void NativePaddlePredictor::PrepareFeedFetch() { bool NativePaddlePredictor::Init( std::shared_ptr parent_scope) { VLOG(3) << "Predictor::init()"; -#if !defined(_WIN32) if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; @@ -73,10 +72,9 @@ bool NativePaddlePredictor::Init( : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } -#endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); @@ -121,12 +119,10 @@ bool NativePaddlePredictor::Init( } NativePaddlePredictor::~NativePaddlePredictor() { -#if !defined(_WIN32) if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); } -#endif if (sub_scope_) { scope_->DeleteScope(sub_scope_); } @@ -139,7 +135,6 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, Timer timer; timer.tic(); // set feed variable - std::vector feeds; framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get(); if (!SetFeed(inputs, scope)) { LOG(ERROR) << "fail to set feed"; @@ -195,17 +190,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, framework::DDim ddim = framework::make_ddim(inputs[i].shape); void *input_ptr; if (inputs[i].dtype == PaddleDType::INT64) { - input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + input_ptr = input.mutable_data(ddim, place_); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { - input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + input_ptr = input.mutable_data(ddim, place_); } else { LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; return false; } - // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. - std::memcpy(static_cast(input_ptr), inputs[i].data.data(), - inputs[i].data.length()); + if (platform::is_cpu_place(place_)) { + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + } else { +#ifdef PADDLE_WITH_CUDA + auto dst_gpu_place = boost::get(place_); + memory::Copy(dst_gpu_place, static_cast(input_ptr), + platform::CPUPlace(), inputs[i].data.data(), + inputs[i].data.length(), + 0); // stream 0 for sync copy +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. framework::LoD lod; for (auto &level : inputs[i].lod) { diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 49683eab07a2f5bc008272038a27bdb277396284..8fb464c0f5443f116815b14324f6cbc966dc6482 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -46,8 +46,6 @@ if(WITH_GPU) endif() endif(NOT WIN32) endif() - -include_directories("D:/Paddle/") include_directories("${PADDLE_LIB}") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include") diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 6f9d663121004470d57c17b8154d725fdf2b9689..9a393a61c4b4586009a022884e88f3f5c6392ed3 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,10 +15,6 @@ #pragma once #include -#if !defined(_WIN32) -#include -#else -#endif #include #include // NOLINT @@ -28,6 +24,7 @@ #include #include #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" namespace paddle { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 2ac736df7ccd54babe582ca1383903c191069d33..a09bd1cac2aa31b2ecee9b0f77d2b777104f1161 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig { int max_batch_size = 1); bool use_tensorrt() const { return use_tensorrt_; } + void EnableMKLDNN(); // NOTE this is just for internal development, please not use it. // NOT stable yet. - void EnableMKLDNN(); bool use_mkldnn() const { return use_mkldnn_; } friend class ::paddle::AnalysisPredictor; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 0a2a2a1a23401b5aa4d3402da6f7a3369280d8f5..1513a4b3b4f6620aa8bf53e082afa203af15ca13 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config { // Specify the variable's name of each input if input tensors don't follow the // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; + + // Set and get the number of cpu math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + } + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + protected: + // number of cpu math library (such as MKL, OpenBlas) threads for each + // instance. + int cpu_math_library_num_threads_{1}; }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 6620c76318f99092236d4009037f2ce01b295164..ae5b1b98060a4e73b2d1761d4edafb152f364070 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -19,9 +19,6 @@ namespace paddle { namespace inference { namespace tensorrt { -/* - * SplitOp. - */ class SplitOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, @@ -40,16 +37,11 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); + // split on batch is not supported in TensorRT PADDLE_ENFORCE(axis != 0); - if (axis < 0) { - axis += input_dims.nbDims; - } else { - axis -= 1; - } + axis += (axis < 0) ? input_dims.nbDims : -1; PADDLE_ENFORCE(output_lengths.size() == output_num); - - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index f81d011552c152c2df79e1a272f34b954ae2a3a1..5aacc5c600dd1371e3865adc888bb8e24640e7d9 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -20,30 +20,92 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(split_op, test) { +template +void TensorRTSplitTest(const std::vector &in_shape, + const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2)); - validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2)); - validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2)); + TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); + + auto make_dim = [](const std::vector &shape) { + nvinfer1::DimsCHW dim; + dim.c() = shape[0]; + dim.h() = shape[1]; + dim.w() = shape[2]; + return dim; + }; + validator.DeclInputVar("split_input", make_dim(in_shape)); + std::vector output_vars; + for (size_t i = 0; i < sections.size(); ++i) { + auto out_shape = in_shape; + out_shape[Axis - 1] = sections[i]; + std::string output_name = "split_out" + std::to_string(i); + validator.DeclOutputVar(output_name, make_dim(out_shape)); + output_vars.push_back(output_name); + } // Prepare Op description framework::OpDesc desc; desc.SetType("split"); desc.SetInput("X", {"split_input"}); - desc.SetOutput("Out", {"split_out1", "split_out2"}); + desc.SetOutput("Out", output_vars); - int num = 0; - int axis = 1; - std::vector output_lengths = {2, 1}; - desc.SetAttr("axis", axis); - desc.SetAttr("num", num); - desc.SetAttr("sections", output_lengths); + desc.SetAttr("axis", Axis); + desc.SetAttr("num", 0); + desc.SetAttr("sections", sections); validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(BatchSize); +} + +// batch = 0, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch1) { + TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); +} +// batch = 0, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch1) { + TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); +} +// batch = 10, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch10) { + TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); +} +// batch = 10, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch10) { + TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); +} +// batch = 0, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2}); +} +// batch = 0, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1}); +} +// batch = 10, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2}); +} +// batch = 10, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1}); +} +// batch = 0, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2}); +} +// batch = 0, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1}); +} +// batch = 10, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2}); +} +// batch = 10, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1}); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 4adea2db1ee80fb20adba3cf4141a6485a1065a0..de61ace59e299a1f51940e4b433a0133d4fbe7ff 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { @@ -19,6 +21,52 @@ namespace inference { namespace tensorrt { namespace plugin { +// copied from operators::math::SplitFunctor +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + nvinfer1::Dims SplitPlugin::getOutputDimensions( int index, const nvinfer1::Dims* input_dims, int num_inputs) { PADDLE_ENFORCE_EQ(num_inputs, 1); @@ -31,48 +79,96 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions( int SplitPlugin::initialize() { PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); - + // notice input dims is [C, H, W] + nvinfer1::Dims dims = this->getInputDims(0); + outer_rows_ = 1; + inner_cols_ = 1; + for (int i = 0; i < axis_; ++i) { + outer_rows_ *= dims.d[i]; + } + for (int i = axis_ + 1; i < dims.nbDims; ++i) { + inner_cols_ *= dims.d[i]; + } + same_shape_ = true; std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { - segment_offsets.push_back(segment_offsets.back() + output_length_[i]); + if (output_length_[i] != output_length_[0]) { + same_shape_ = false; + } + segment_offsets.push_back(segment_offsets.back() + + output_length_[i] * inner_cols_); } - segment_offsets_ = segment_offsets; - nvinfer1::Dims dims = this->getInputDims(0); - nx_ = 1; - for (int i = dims.nbDims - 1; i > axis_; --i) { - nx_ *= dims.d[i]; + inner_cols_ *= dims.d[axis_]; + d_segment_offsets_ = segment_offsets; + segment_offsets_ = std::move(segment_offsets); + d_output_ptrs_.resize(this->getNbOutputs(), nullptr); + return 0; +} + +template +inline void Split(cudaStream_t stream, const bool same_shape, + const int outer_rows, const int inner_cols, + const std::vector& segment_offsets, + const int* d_segment_offsets, const T* input, T** outputs) { + const int kThreadsPerBlock = 1024; + const int kMaxBlocks = 65535; + int block_cols = kThreadsPerBlock; + if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((inner_cols + 31) >> 5) << 5; } - ny_ = dims.d[axis_]; - nz_ = 1; - for (int i = axis_ - 1; i >= 0; --i) { - nz_ *= dims.d[i]; + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int grid_cols = + std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks); + int grid_rows = + std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (same_shape) { + SplitKernel<<>>( + input, outer_rows, inner_cols, segment_offsets[1], outputs); + } else { + SplitKernel<<>>( + input, outer_rows, inner_cols, d_segment_offsets, + static_cast(segment_offsets.size()), outputs); } - return 0; } int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { - auto const& input_dims = this->getInputDims(0); - int input_size = 0; - float const* idata = reinterpret_cast(inputs[0]); - float** odatas = reinterpret_cast(outputs); - - // kernel impl here. - int inputBatchOffset = nx_ * ny_ * nz_; - for (size_t i = 0; i < this->getNbOutputs(); i++) { - for (size_t j = 0; j < batchSize; j++) { - cudaMemcpyAsync( - odatas[i] + - j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * - sizeof(float), - inputs[0] + - (inputBatchOffset * j + segment_offsets_[i] * nx_) * - sizeof(float), - (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), - cudaMemcpyDeviceToDevice, stream); + float const* input_ptr = reinterpret_cast(inputs[0]); + if (((batchSize == 1 && axis_ == 0) || axis_ == -1) && + this->getNbOutputs() < 10) { + float** output_ptrs = reinterpret_cast(outputs); + int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) + ? sizeof(float) + : sizeof(__half); + for (int i = 0; i < this->getNbOutputs(); ++i) { + PADDLE_ENFORCE( + cudaMemcpyAsync( + output_ptrs[i], input_ptr + segment_offsets_[i], + (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size, + cudaMemcpyDeviceToDevice, stream) == cudaSuccess); + } + } else { + outer_rows_ *= batchSize; + const int* d_segment_offsets_ptr = + thrust::raw_pointer_cast(&d_segment_offsets_[0]); + float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); + PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs, + this->getNbOutputs() * sizeof(float*), + cudaMemcpyHostToDevice, + stream) == cudaSuccess); + if (this->getDataType() == nvinfer1::DataType::kFLOAT) { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, input_ptr, output_ptrs); + } else { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT + (__half**)output_ptrs); // NOLINT } } - return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index b5b6e69992b057a1478f61457b4ae6f5f1619b4d..6f028d3d72ae3cc7d96c6782b734cdbf1243c06c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,7 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) {} + : axis_(axis), same_shape_(true), output_length_(output_lengths) {} SplitPlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); @@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT { } int axis_; + int outer_rows_; + int inner_cols_; + bool same_shape_; std::vector output_length_; - int nx_, ny_, nz_; std::vector segment_offsets_; + thrust::device_vector d_segment_offsets_; + thrust::device_vector d_output_ptrs_; }; } // namespace plugin diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index e8bd13037ed6c2c3c639b76f6f3561921fb6ee37..7dc88d9dd052c59aaa59b7802ee5a38ea9d89bc6 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) - inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") + inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) @@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI - # anakin rnn1 - set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin") - set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1") - inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin") - inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt") - cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc - ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin - --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt - DEPS inference_anakin_api_shared SERIAL) - # anakin mobilenet - if(WITH_GPU) - set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet") - inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin") - cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc - ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api_shared dynload_cuda SERIAL) - endif() + # anakin rnn1 + set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin") + set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1") + inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin") + inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt") + cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc + ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin + --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt + DEPS inference_anakin_api_shared SERIAL) + # anakin mobilenet + if(WITH_GPU) + set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet") + inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin") + cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc + ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin + DEPS inference_anakin_api_shared dynload_cuda SERIAL) + endif() endif() if(WITH_GPU AND TENSORRT_FOUND) - set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") - if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) - inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") - endif() - inference_analysis_test(test_trt_models SRCS trt_models_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) + set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") + endif() + inference_analysis_test(test_trt_models SRCS trt_models_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index c4022225fd4526998af8526d0afb87e7a5be6336..da42688f29f044639d6d97ff09f932490d4c3d54 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include #include diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 2b936175ed3f8ec24826485027048c82df0461ab..abc63577b7913a3c9de7d6c16d8ac3e85ffd7c3c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index aa0c4b1d049bc276cda2f58ac1edd8102fb3fd88..4231eef7220735d0b80eb1adc951c55ff7378f1b 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; + os << GenSpaces(num_spaces) + << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7b686045a59c93a93322f99c2cdf7050ddbf0a6d..d572ea0177c1e398229a02718ca31cc78a7059ef 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { @@ -177,11 +178,9 @@ void TestOneThreadPrediction( warmup_timer.tic(); predictor->Run(inputs[0], outputs, batch_size); PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1); -#if !defined(_WIN32) if (FLAGS_profile) { paddle::platform::ResetProfiler(); } -#endif } LOG(INFO) << "Run " << num_times << " times..."; @@ -206,22 +205,23 @@ void TestMultiThreadPrediction( int batch_size = FLAGS_batch_size; int num_times = FLAGS_repeat; std::vector threads; - std::vector> predictors; - predictors.emplace_back(CreateTestPredictor(config, use_analysis)); - for (int tid = 1; tid < num_threads; ++tid) { - predictors.emplace_back(predictors.front()->Clone()); - } + auto main_predictor = CreateTestPredictor(config, use_analysis); size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { -#ifdef PADDLE_WITH_MKLDNN - platform::set_cur_thread_id(static_cast(tid) + 1); -#endif // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. std::vector outputs_tid; - auto &predictor = predictors[tid]; + // To ensure the thread binding correctly, + // please clone inside the threadpool. + auto predictor = main_predictor->Clone(); +#ifdef PADDLE_WITH_MKLDNN + if (use_analysis) { + static_cast(predictor.get()) + ->SetMkldnnThreadID(static_cast(tid) + 1); + } +#endif // warmup run LOG(INFO) << "Running thread " << tid << ", warm up run..."; @@ -230,11 +230,9 @@ void TestMultiThreadPrediction( warmup_timer.tic(); predictor->Run(inputs[0], outputs, batch_size); PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); -#if !defined(_WIN32) if (FLAGS_profile) { paddle::platform::ResetProfiler(); } -#endif } LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index cbcfc964c91c33ab41a72ad7fec759086ad887cc..5c1204b9e6b78e42d999b12a2b7be6f822ecf818 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include // NOLINT diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 2118fcfd4bb1589947617e462f09971fcc090b98..75fa611c0d701dd81dfe5b33231655e0959c7dbf 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(use_mkldnn); diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2104e4ac7222258ee025bd5acd60b1db251df654 --- /dev/null +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(benchmark SRCS benchmark.cc DEPS enforce) +cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..021edc2de5e90023fcd1431dd2025450e7462bd9 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/utils/benchmark.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +std::string Benchmark::SerializeToString() const { + std::stringstream ss; + ss << "-----------------------------------------------------\n"; + ss << "name\t"; + ss << "batch_size\t"; + ss << "num_threads\t"; + ss << "latency\t"; + ss << "qps"; + ss << '\n'; + + ss << name_ << "\t"; + ss << batch_size_ << "\t"; + ss << num_threads_ << "\t"; + ss << latency_ << "\t"; + ss << 1000 / latency_; + ss << '\n'; + return ss.str(); +} +void Benchmark::PersistToFile(const std::string &path) const { + std::ofstream file(path, std::ios::app); + PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path); + file << SerializeToString(); + file.flush(); + file.close(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h new file mode 100644 index 0000000000000000000000000000000000000000..80e8f77adb4ff2cc81a2a3dd0c44e4e304800122 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace paddle { +namespace inference { + +/* + * Helper class to calculate the performance. + */ +struct Benchmark { + int batch_size() const { return batch_size_; } + void SetBatchSize(int x) { batch_size_ = x; } + + int num_threads() const { return num_threads_; } + void SetNumThreads(int x) { num_threads_ = x; } + + bool use_gpu() const { return use_gpu_; } + void SetUseGpu() { use_gpu_ = true; } + + int latency() const { return latency_; } + void SetLatency(int x) { latency_ = x; } + + const std::string& name() const { return name_; } + void SetName(const std::string& name) { name_ = name; } + + std::string SerializeToString() const; + void PersistToFile(const std::string& path) const; + + private: + bool use_gpu_{false}; + int batch_size_{0}; + int latency_; + int num_threads_{1}; + std::string name_; +}; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb255474082b27180a8b3176b5f880c0d38f6c3b --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/utils/benchmark.h" +#include +#include + +using namespace paddle::inference; +TEST(Benchmark, basic) { + Benchmark benchmark; + benchmark.SetName("key0"); + benchmark.SetBatchSize(10); + benchmark.SetUseGpu(); + benchmark.SetLatency(220); + LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString(); +} + +TEST(Benchmark, PersistToFile) { + Benchmark benchmark; + benchmark.SetName("key0"); + benchmark.SetBatchSize(10); + benchmark.SetUseGpu(); + benchmark.SetLatency(220); + + benchmark.PersistToFile("1.log"); + benchmark.PersistToFile("1.log"); + benchmark.PersistToFile("1.log"); +} \ No newline at end of file diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index a0ce2875cb8337a59ec03730e5cf66d2fc622001..f0b215dac252475217a403e680a23559280b0e8d 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) { size_t thread_num = 32; size_t sleep_time = 40; - size_t extra_time = 2; + size_t extra_time = 10; // Reserve to perform more tests in the future std::vector> allocators; diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index 501807e7f3e04ae75386bfa00797d244cd9eac9c..40b46781daa989fcd89887a3c01e97e39ea71255 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -46,7 +46,7 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) { auto* scores_data = scores->mutable_data(place); vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); vector _scores( - {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1}); + {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); for (int i = 0; i < 12; i++) { ids_data[i] = _ids[i]; @@ -80,7 +80,7 @@ TEST(DISABLED_beam_search_op, run) { ASSERT_EQ(sids.lod(), sscores.lod()); vector tids({4, 2, 3, 8}); - vector tscores({0.5, 0.6, 0.9, 0.7}); + vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 58f6f48467310ffb2429ad440f58fcd823edf079..6c85f1577e0c49d00f4ccf7fa7be0974eb62bdf3 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,7 +22,7 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) -detection_library(density_prior_box_op SRCS density_prior_box_op.cc) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 99df15c3226b4305a28a3912398d6d1c766daa73..1012ba3652ddfb06af9292e8061684481f1dbef3 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); auto densities = ctx->Attrs().Get>("densities"); + bool flatten = ctx->Attrs().Get("flatten_to_2d"); PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), "The number of fixed_sizes and densities must be equal."); size_t num_priors = 0; - if ((fixed_sizes.size() > 0) && (densities.size() > 0)) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + if (!flatten) { + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } else { + int64_t dim0 = input_dims[2] * input_dims[3] * num_priors; + ctx->SetOutputDim("Boxes", {dim0, 4}); + ctx->SetOutputDim("Variances", {dim0, 4}); } - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_priors; - dim_vec[3] = 4; - ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); - ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); } protected: @@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + ctx.GetPlace()); } }; @@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") .SetDefault(true); - + AddAttr("flatten_to_2d", + "(bool) Whether to flatten to 2D and " + "the second dim is 4.") + .SetDefault(false); AddAttr( "step_w", "Density prior boxes step across width, 0.0 for auto calculation.") diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3b7c781795f02b9d9c9f2ead51034193ceb2a745 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +template +static __device__ inline T Clip(T in) { + return min(max(in, 0.), 1.); +} + +template +static __global__ void GenDensityPriorBox( + const int height, const int width, const int im_height, const int im_width, + const T offset, const T step_width, const T step_height, + const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, + const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { + int gidx = blockIdx.x * blockDim.x + threadIdx.x; + int gidy = blockIdx.y * blockDim.y + threadIdx.y; + int step_x = blockDim.x * gridDim.x; + int step_y = blockDim.y * gridDim.y; + + const T* width_ratio = ratios_shift; + const T* height_ratio = ratios_shift + num_priors; + const T* width_shift = ratios_shift + 2 * num_priors; + const T* height_shift = ratios_shift + 3 * num_priors; + + for (int j = gidy; j < height; j += step_y) { + for (int i = gidx; i < width * num_priors; i += step_x) { + int h = j; + int w = i / num_priors; + int k = i % num_priors; + + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + + T center_x_temp = center_x + width_shift[k]; + T center_y_temp = center_y + height_shift[k]; + + T box_width_ratio = width_ratio[k] / 2.; + T box_height_ratio = height_ratio[k] / 2.; + + T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.); + T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.); + T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.); + T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.); + + int out_offset = (j * width * num_priors + i) * 4; + out[out_offset] = is_clip ? Clip(xmin) : xmin; + out[out_offset + 1] = is_clip ? Clip(ymin) : ymin; + out[out_offset + 2] = is_clip ? Clip(xmax) : xmax; + out[out_offset + 3] = is_clip ? Clip(ymax) : ymax; + + var[out_offset] = var_xmin; + var[out_offset + 1] = var_ymin; + var[out_offset + 2] = var_xmax; + var[out_offset + 3] = var_ymax; + } + } +} + +template +class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto is_clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = 0; + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + int step_average = static_cast((step_width + step_height) * 0.5); + + framework::Tensor h_temp; + T* tdata = h_temp.mutable_data({num_priors * 4}, platform::CPUPlace()); + int idx = 0; + for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = shift / 2. + dj * shift - step_average / 2.; + float center_y_temp = shift / 2. + di * shift - step_average / 2.; + tdata[idx] = box_width_ratio; + tdata[num_priors + idx] = box_height_ratio; + tdata[2 * num_priors + idx] = center_x_temp; + tdata[3 * num_priors + idx] = center_y_temp; + idx++; + } + } + } + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + framework::Tensor d_temp; + framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp); + + // At least use 32 threads, at most 512 threads. + // blockx is multiple of 32. + int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L); + int gridx = (feature_width * num_priors + blockx - 1) / blockx; + dim3 threads(blockx, 1); + dim3 grids(gridx, feature_height); + + auto stream = + ctx.template device_context().stream(); + GenDensityPriorBox<<>>( + feature_height, feature_width, img_height, img_width, offset, + step_width, step_height, num_priors, d_temp.data(), is_clip, + variances[0], variances[1], variances[2], variances[3], + boxes->data(), vars->data()); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(density_prior_box, + ops::DensityPriorBoxOpCUDAKernel, + ops::DensityPriorBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index 9a52077e9cf90b278549a077af161bd4e282d972..ed2f5df80cf4d7a5a44af9b09f3b048b1b14cdb9 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { step_height = step_h; } int num_priors = 0; - if (fixed_sizes.size() > 0 && densities.size() > 0) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); } boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); - auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); + auto box_dim = vars->dims(); + boxes->Resize({feature_height, feature_width, num_priors, 4}); + auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); int step_average = static_cast((step_width + step_height) * 0.5); for (int h = 0; h < feature_height; ++h) { @@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { auto fixed_size = fixed_sizes[s]; int density = densities[s]; // Generate density prior boxes with fixed ratios. - if (fixed_ratios.size() > 0) { - for (size_t r = 0; r < fixed_ratios.size(); ++r) { - float ar = fixed_ratios[r]; - int shift = step_average / density; - float box_width_ratio = fixed_size * sqrt(ar); - float box_height_ratio = fixed_size / sqrt(ar); - for (int di = 0; di < density; ++di) { - for (int dj = 0; dj < density; ++dj) { - float center_x_temp = - center_x - step_average / 2. + shift / 2. + dj * shift; - float center_y_temp = - center_y - step_average / 2. + shift / 2. + di * shift; - e_boxes(h, w, idx, 0) = - (center_x_temp - box_width_ratio / 2.) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.) / img_width - : 0; - e_boxes(h, w, idx, 1) = - (center_y_temp - box_height_ratio / 2.) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.) / img_height - : 0; - e_boxes(h, w, idx, 2) = - (center_x_temp + box_width_ratio / 2.) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.) / img_width - : 1; - e_boxes(h, w, idx, 3) = - (center_y_temp + box_height_ratio / 2.) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.) / img_height - : 1; - idx++; - } + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = + center_x - step_average / 2. + shift / 2. + dj * shift; + float center_y_temp = + center_y - step_average / 2. + shift / 2. + di * shift; + e_boxes(h, w, idx, 0) = + (center_x_temp - box_width_ratio / 2.) / img_width >= 0 + ? (center_x_temp - box_width_ratio / 2.) / img_width + : 0; + e_boxes(h, w, idx, 1) = + (center_y_temp - box_height_ratio / 2.) / img_height >= 0 + ? (center_y_temp - box_height_ratio / 2.) / img_height + : 0; + e_boxes(h, w, idx, 2) = + (center_x_temp + box_width_ratio / 2.) / img_width <= 1 + ? (center_x_temp + box_width_ratio / 2.) / img_width + : 1; + e_boxes(h, w, idx, 3) = + (center_y_temp + box_height_ratio / 2.) / img_height <= 1 + ? (center_y_temp + box_height_ratio / 2.) / img_height + : 1; + idx++; } } } @@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); vars->Resize(var_dim); + boxes->Resize(box_dim); } }; // namespace operators diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index c28f86146d3040c6a26cabfb795eff67375d4b76..7ce9669517bf115274e314e1116f8a5e69277b50 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "glog/logging.h" // For VLOG @@ -20,8 +19,11 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(rpc_disable_reuse_port); + namespace paddle { namespace operators { namespace distributed { @@ -383,6 +385,9 @@ std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { // Channel configurations: grpc::ChannelArguments args; args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); + if (FLAGS_rpc_disable_reuse_port) { + args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); + } args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index f27b70a5a3dd2927b51a95af7bd1b84a6e232f86..e6856676d49e867214801810949076151e34356a 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,7 +15,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -26,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h index 7ec489e961630747ba00e68ad3603cacbb1aa485..17290d3fb4478191c59623913a82d4142d3c49f9 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc_serde.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include + #include #include #include @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index ffd2b1707bea6c9379dc09c629fa4c920dac8ed0..77bf67be25780b1fe18c0b2c56680b549637d547 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -20,6 +20,8 @@ limitations under the License. */ using ::grpc::ServerAsyncResponseWriter; +DECLARE_bool(rpc_disable_reuse_port); + namespace paddle { namespace operators { namespace distributed { @@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() { VLOG(40) << "AsyncGRPCServer WaitSeverReady"; } +// Define an option subclass in order to disable SO_REUSEPORT for the +// server socket. +// Come from: +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +class NoReusePortOption : public ::grpc::ServerBuilderOption { + public: + void UpdateArguments(::grpc::ChannelArguments* args) override { + args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); + } + + void UpdatePlugins(std::vector>* + plugins) override {} +}; + void AsyncGRPCServer::StartServer() { ::grpc::ServerBuilder builder; builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), @@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() { builder.SetMaxSendMessageSize(std::numeric_limits::max()); builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + if (FLAGS_rpc_disable_reuse_port) { + builder.SetOption( + std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); + } builder.RegisterService(&service_); for (auto t : rpc_call_map_) { diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 374fa680e3681d2e4b1d7513a9522810a15fe485..6ba883ba01f73bd941e7b1b58afb777e2588764b 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -15,12 +15,14 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#include #include // NOLINT #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/platform/port.h" + +DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 480fc59c4281edbfa5f08e07a86c5f1257adb4be..523e56fe3e4eaf9d4faa8f0bf91d0b6075e0b0e5 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include #include #include @@ -24,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 5d468316e8eacb73c4a4ce81c784880bb5e46c2d..a0397acab1267365b8aeba30a63152b61b5b25bb 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -1,2 +1,6 @@ include(operators) -register_operators() +register_operators(EXCLUDES fusion_transpose_flatten_concat_op) +if (WITH_GPU) + op_library(fusion_transpose_flatten_concat_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") +endif() diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 7e34d1019c9e6577b50ff8c2fa3d767124b5ff3b..25b7ae7c2828b0919b805a2d0aac8b9cacf7d6e3 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -183,24 +183,27 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const auto& ker = math::jitkernel::KernelPool::Instance() \ - .template Get, \ - const std::string&, const std::string&>( \ - ctx.Attr("gate_activation"), \ - ctx.Attr("activation"), D); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const math::jitkernel::gru_attr_t attr( \ + D, ctx.Attr("gate_activation"), \ + ctx.Attr("activation")); \ + math::jitkernel::gru_t one_step; \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const math::jitkernel::gru_attr_t&>(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { @@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel { if (h0_data) { prev_hidden_data = h0_data + bid * D; } else { - ker->ComputeH1(xx_data, hidden_out_data); + one_step.gates = xx_data; + one_step.ht = hidden_out_data; + ker->ComputeH1(&one_step, &attr); prev_hidden_data = hidden_out_data; tstart = 1; move_step(); @@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel { blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast(1), prev_hidden_data, D, wh_data, D2, static_cast(1), xx_data, D3); - ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data); + one_step.gates = xx_data; + one_step.ht_1 = prev_hidden_data; + one_step.ht = hidden_out_data; + ker->ComputeHtPart1(&one_step, &attr); // gemm rt * Ws blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), hidden_out_data, D, wh_state_data, D, static_cast(1), xx_data + D2, D3); - ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data); + ker->ComputeHtPart2(&one_step, &attr); // save prev prev_hidden_data = hidden_out_data; move_step(); @@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; // W: {W_update, W_reset; W_state} for (int i = 0; i < max_bs; ++i) { - ker->ComputeH1(cur_in_data, cur_out_data); + one_step.gates = cur_in_data; + one_step.ht = cur_out_data; + ker->ComputeH1(&one_step, &attr); // add offset cur_in_data += D3; cur_out_data += D; @@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; T* cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data, - cur_out_data); + one_step.gates = cur_batched_data; + one_step.ht_1 = cur_prev_hidden_data; + one_step.ht = cur_out_data; + ker->ComputeHtPart1(&one_step, &attr); + cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; @@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel { cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data, - cur_out_data); + one_step.gates = cur_batched_data; + one_step.ht_1 = cur_prev_hidden_data; + one_step.ht = cur_out_data; + ker->ComputeHtPart2(&one_step, &attr); cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 0959539068eef5b550a8e3997d3f11ea67ae0707..8021a896ceaa808f5754b5d165aa0c8c8cb8034a 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -236,27 +236,31 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const auto& ker = \ - math::jitkernel::KernelPool::Instance() \ - .template Get, const std::string&, \ - const std::string&, const std::string&>( \ - ctx.Attr("gate_activation"), \ - ctx.Attr("candidate_activation"), \ - ctx.Attr("cell_activation"), D, use_peepholes) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const math::jitkernel::lstm_attr_t attr( \ + D, ctx.Attr("gate_activation"), \ + ctx.Attr("candidate_activation"), \ + ctx.Attr("cell_activation"), use_peepholes); \ + math::jitkernel::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const math::jitkernel::lstm_attr_t&>(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ @@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_h_data = h0_data + bid * D; prev_c_data = c0_data + bid * D; } else { - ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data); + one_step.gates = xx_data; + one_step.ct = c_out_data; + one_step.ht = h_out_data; + ker->ComputeC1H1(&one_step, &attr); tstart = 1; // move one step prev_h_data = h_out_data; @@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel { } for (int step = tstart; step < seq_len; ++step) { GEMM_WH_ADDON(1, prev_h_data, xx_data); - ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data, - checked_cell_data); + + one_step.gates = xx_data; + one_step.ct_1 = prev_c_data; + one_step.ct = c_out_data; + one_step.ht = h_out_data; + ker->ComputeCtHt(&one_step, &attr); // move one step prev_h_data = h_out_data; prev_c_data = c_out_data; @@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = batched_c_out_data; for (int i = 0; i < max_bs; ++i) { - ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data); + one_step.gates = cur_in_data; + one_step.ct = cur_c_out_data; + one_step.ht = cur_h_out_data; + ker->ComputeC1H1(&one_step, &attr); + cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; @@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_c_out_data = batched_c_out_data; T* cur_h_out_data = batched_h_out_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data, wp_data, checked_cell_data); + one_step.gates = cur_in_data; + one_step.ct_1 = cur_prev_c_data; + one_step.ct = cur_c_out_data; + one_step.ht = cur_h_out_data; + ker->ComputeCtHt(&one_step, &attr); + // move one batch cur_in_data += D4; cur_prev_c_data += D; diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..39356c9afccbf9af3eacf99a6bccb15e18f7e485 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + "Inputs(X) of ConcatOp should be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ConcatOp should not be null."); + + auto ins = ctx->GetInputsDim("X"); + const size_t n = ins.size(); + PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0."); + + std::vector trans_axis = + ctx->Attrs().Get>("trans_axis"); + int flatten_axis = ctx->Attrs().Get("flatten_axis"); + int concat_axis = ctx->Attrs().Get("concat_axis"); + + size_t x_rank = ins[0].size(); + size_t trans_axis_size = trans_axis.size(); + PADDLE_ENFORCE_EQ(x_rank, trans_axis_size, + "The input tensor's rank(%d) " + "should be equal to the permutation axis's size(%d)", + x_rank, trans_axis_size); + + auto dims0 = + GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0])); + std::vector out_dims(dims0); + for (size_t i = 1; i < n; i++) { + auto dimsi = + GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i])); + for (int j = 0; j < static_cast(dims0.size()); j++) { + if (j == concat_axis) { + out_dims[concat_axis] += dimsi[j]; + } else { + PADDLE_ENFORCE_EQ(out_dims[j], dimsi[j], + "After flatting, the %d-th dim should be save " + "except the specify axis.", + j); + } + } + } + if (out_dims[concat_axis] < 0) { + out_dims[concat_axis] = -1; + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } +}; + +class TransposeFlattenConcatFusionOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor) The input tensor, tensors with rank up to 6 are supported.") + .AsDuplicable(); + AddOutput("Out", "(Tensor)The output tensor."); + AddAttr>( + "trans_axis", + "(vector) A list of values, and the size of the list should be " + "the same with the input tensor rank. This operator permutes the input " + "tensor's axes according to the values given."); + AddAttr("flatten_axis", + "(int)" + "Indicate up to which input dimensions (exclusive) should be" + "flattened to the outer dimension of the output. The value" + "for axis must be in the range [0, R], where R is the rank of" + "the input tensor. When axis = 0, the shape of the output" + "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the" + "input tensor is (d_0, d_1, ... d_n)."); + AddAttr("concat_axis", + "The axis along which the input tensors will be concatenated. " + "It should be 0 or 1, since the tensor is 2D after flatting."); + AddComment(R"DOC( + + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_transpose_flatten_concat, + ops::TransposeFlattenConcatFusionOp, + ops::TransposeFlattenConcatFusionOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ccb670d73c803bb1b9827f0f30b99d272bfce79 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +template +using CudnnDataType = platform::CudnnDataType; + +template +class TransposeFlattenConcatFusionKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + auto odims = out->dims(); + + std::vector trans_axis = ctx.Attr>("trans_axis"); + int flatten_axis = ctx.Attr("flatten_axis"); + int concat_axis = ctx.Attr("concat_axis"); + + int rank = ins[0]->dims().size(); + // use at least 4D in cudnnTransformTensor + int max_dim = rank < 4 ? 4 : rank; + std::vector stride_x(max_dim, 0); + std::vector stride_y(max_dim, 0); + std::vector dims_y(max_dim, 0); + + cudnnTensorDescriptor_t in_desc; + cudnnTensorDescriptor_t out_desc; + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); + cudnnDataType_t cudnn_dtype = CudnnDataType::type; + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + + T* odata = out->data(); + for (size_t k = 0; k < ins.size(); ++k) { + auto perm_shape = GetPermuteShape(trans_axis, ins[k]->dims()); + int osize = 1; + auto idims = ins[k]->dims(); + for (int i = 0; i < rank; i++) { + stride_x[i] = 1; + for (int j = trans_axis[i] + 1; j < rank; j++) { + stride_x[i] *= idims[j]; + } + dims_y[i] = perm_shape[i]; + osize *= perm_shape[i]; + } + stride_y[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + if (((i + 1) == flatten_axis) && (concat_axis == 1)) { + stride_y[i] = odims[1]; + } else { + stride_y[i] = stride_y[i + 1] * perm_shape[i + 1]; + } + } + + // Since concat is aftern flatten, the output is 2D tensor. + // If concat_axis is 0, each input's permutated tensor is continuous. + // If concat_axis is 1, the stride of 0-th dim of each input's + // permutated tensor is odims()[1]. + + for (int i = rank; i < max_dim; i++) { + stride_x[i] = 1; + stride_y[i] = 1; + dims_y[i] = 1; + } + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); + + CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor( + handle, CudnnDataType::kOne(), in_desc, + static_cast(ins[k]->data()), + CudnnDataType::kZero(), out_desc, static_cast(odata))); + if (concat_axis == 0) { + odata += osize; + } else { + auto flat_shape = GetFlattenShape(flatten_axis, perm_shape); + odata += flat_shape[1]; + } + } + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fusion_transpose_flatten_concat, + ops::TransposeFlattenConcatFusionKernel, + ops::TransposeFlattenConcatFusionKernel); diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..66d5bea679fc85ce6b1ba64921107aef987ccaa8 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +inline std::vector GetPermuteShape(const std::vector& axis, + const framework::DDim& in_dims) { + std::vector out_dims(in_dims.size()); + for (size_t i = 0; i < axis.size(); i++) { + out_dims[i] = in_dims[axis[i]]; + } + return out_dims; +} + +inline std::vector GetFlattenShape(const int axis, + const std::vector& in_dims) { + int64_t outer = 1, inner = 1; + for (int i = 0; i < static_cast(in_dims.size()); ++i) { + if (i < axis) { + outer *= in_dims[i]; + } else { + inner *= in_dims[i]; + } + } + std::vector out_shape(2); + out_shape[0] = outer; + out_shape[1] = inner; + return out_shape; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 8f979e05d31e5a85bc86784943f4588ab650f668..4d25822259a2788322b4246696a0f102d6da51a7 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -76,11 +76,12 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("out_h", "output height of interpolate op."); AddAttr("out_w", "output width of interpolate op."); - AddAttr( - "interp_method", - "(string), interpolation method, can be \"bilinear\" for " - "bilinear interpolation and \"nearest\" for nearest " - "neighbor interpolation."); + AddAttr("interp_method", + "(string, default \"bilinear\"), interpolation " + "method, can be \"bilinear\" for " + "bilinear interpolation and \"nearest\" for nearest " + "neighbor interpolation.") + .SetDefault("bilinear"); AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" @@ -132,11 +133,19 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(interpolate, ops::InterpolateOp, ops::InterpolateOpMaker, +REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(interpolate_grad, ops::InterpolateOpGrad); -REGISTER_OP_CPU_KERNEL(interpolate, ops::InterpolateKernel, +REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad); +REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad); +REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel, + ops::InterpolateKernel, + ops::InterpolateKernel); +REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, ops::InterpolateGradKernel, + ops::InterpolateGradKernel); +REGISTER_OP_CPU_KERNEL(nearest_interp, ops::InterpolateKernel, ops::InterpolateKernel, ops::InterpolateKernel); -REGISTER_OP_CPU_KERNEL(interpolate_grad, ops::InterpolateGradKernel, +REGISTER_OP_CPU_KERNEL(nearest_interp_grad, ops::InterpolateGradKernel, ops::InterpolateGradKernel); diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 190afbdac431f863c32e2a4a4b3ad83848e550fc..99ac725f73bf60ab0fb9a467432e8a57c646ef35 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -284,9 +284,15 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(interpolate, ops::InterpolateOpCUDAKernel, +REGISTER_OP_CUDA_KERNEL(bilinear_interp, ops::InterpolateOpCUDAKernel, ops::InterpolateOpCUDAKernel, ops::InterpolateOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(interpolate_grad, +REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad, + ops::InterpolateGradOpCUDAKernel, + ops::InterpolateGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(nearest_interp, ops::InterpolateOpCUDAKernel, + ops::InterpolateOpCUDAKernel, + ops::InterpolateOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(nearest_interp_grad, ops::InterpolateGradOpCUDAKernel, ops::InterpolateGradOpCUDAKernel); diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index a6843f20a59a23bd4e875b0f96524cc8d7aa46d6..1b55527fd33e879c5c6fe702a53d0a23bebc2b14 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -67,6 +67,7 @@ class LookupSparseTableOp : public framework::OperatorBase { framework::proto::VarType::FP32, "The sparse table only support FP32"); w_t->Get(ids_t, out_t, true, is_test); + out_t->set_lod(ids_t.lod()); } }; diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d84c88cb3bc1a13acb83b3444dbd1bfca3cba503..d35073029a3440d8a17e383ce97fcfc582663888 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -16,6 +16,9 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_bool(enable_cublas_tensor_op_math); namespace paddle { namespace operators { @@ -42,11 +45,44 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const float *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const float *beta, void *C, + cudaDataType_t Ctype, int ldc) { + // Because the gcc 4.8 doesn't expand template parameter pack that + // appears in a lambda-expression, I can not use template parameter pack + // here. + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (platform::TensorCoreAvailable() ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc)); +#else + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -69,13 +105,18 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); #endif } + + template + static void GEMM_EX(ARGS... args) { + PADDLE_THROW("Currently there are not cublasDgemmEx."); + } }; template <> @@ -96,14 +137,16 @@ struct CUBlas { reinterpret_cast<__half *>(C), ldc)); } - static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const float16 *alpha, const float16 *A, int lda, - long long int strideA, const float16 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const float16 *beta, float16 *C, int ldc, - long long int strideC, // NOLINT - int batchCount) { + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float16 *alpha, const float16 *A, + int lda, long long int strideA, // NOLINT + const float16 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const float16 *beta, float16 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, @@ -114,6 +157,45 @@ struct CUBlas { ldc, strideC, batchCount)); #else PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const void *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const void *beta, void *C, + cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); +#else + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -133,8 +215,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, N); +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, N); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 } template <> @@ -157,30 +252,18 @@ inline void Blas::GEMM( PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, "cublas fp16 gemm requires GPU compute capability >= 53"); -#if CUDA_VERSION >= 8000 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - if (context_.GetComputeCapability() >= 70) { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH)); - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } else { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_DEFAULT_MATH)); - } -#endif // CUDA_VERSION >= 9000 - +#if CUDA_VERSION >= 8000 // cublasHgemm does true FP16 computation which is slow for non-Volta // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, - CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, - CUDA_R_32F, algo)); + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, @@ -199,8 +282,38 @@ void Blas::GEMM(bool transA, bool transB, int M, // the cblas convention. cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM( + bool transA, bool transB, int M, int N, int K, platform::float16 alpha, + const platform::float16 *A, int lda, const platform::float16 *B, int ldb, + platform::float16 beta, platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc); } template <> @@ -238,9 +351,34 @@ void Blas::BatchedGEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; - CUBlas::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, - strideC, batchCount); +#if CUDA_VERSION >= 9010 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto cublas_call = [&]() { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, + CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + }; + auto &dev_ctx = const_cast(context_); + dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + } else { +#endif // CUDA_VERSION >= 9010 + + CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, strideB, A, lda, + strideA, &beta, C, ldc, strideC, batchCount); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 } } // namespace math diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 18a586f8dd9f01357d9facca19c51ed5c293ffd2..ad734bae425f3b3edf3ed57474285c1b5a754416 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include @@ -22,6 +21,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/port.h" inline double GetCurrentUS() { struct timeval time; diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index b072b4c20a171d148bd892c162436d03da404fb9..5b9953a5aa9a29bd917d16a16c678fc32a32c18f 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -17,8 +17,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/jit_kernel.h" -DECLARE_int32(paddle_num_threads); - namespace paddle { namespace operators { namespace math { @@ -43,7 +41,7 @@ inline void FCCompute(const BlasT& blas, const int M, .template Get>(N); #ifdef PADDLE_WITH_MKLML -#pragma omp parallel for if (FLAGS_paddle_num_threads > 1) +#pragma omp parallel for #endif for (int i = 0; i < M; i++) { T* dst = Y + i * N; diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index ae2c90b33a4298ada4fd01aa2a44ebdf10d036d4..521cd7801abd6bea0c4f27422141742921dddf53 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include -#include #include #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/fluid/platform/port.h" template void testIm2col() { diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e484e9a3c705c5638fa94010a4513ae1566a8be3..52cbdf685dee651cbc1490dc6faacb8680004c89 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_code.h" +#include // offsetof #include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me namespace paddle { @@ -139,32 +140,10 @@ bool VActJitCode::init(int d, operand_type type) { } void VActJitCode::generate() { - xmm_t xmm_zero = xmm_t(2); - ymm_t ymm_zero = ymm_t(2); - if (type_ == operand_type::relu) { - vxorps(ymm_zero, ymm_zero, ymm_zero); - } int offset = 0; for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { vmovups(ymm_src, ptr[param1 + offset]); - switch (type_) { - case operand_type::relu: - relu_jmm(ymm_dst, ymm_src, ymm_zero); - break; - case operand_type::exp: - exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::identity: - break; - default: - break; - } + act(ymm_dst, ymm_src, type_); vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } @@ -181,22 +160,7 @@ void VActJitCode::generate() { block = 1; vmovss(xmm_src, ptr[param1 + offset]); } - switch (type_) { - case operand_type::relu: - relu_jmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; - } + act(xmm_dst, xmm_src, type_); if (rest >= 4) { vmovups(ptr[param2 + offset], xmm_dst); } else if (rest >= 2) { @@ -210,6 +174,158 @@ void VActJitCode::generate() { ret(); } +bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } + +void LSTMJitCode::generate() { + if (use_peephole_) { + preCode(); + } + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ct_1 = r9; + reg64_t reg_ptr_ct = r10; + reg64_t reg_ptr_ht = r11; + reg64_t reg_ptr_wp = r12; + mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); + mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); + mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + if (use_peephole_) { + mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]); + } + + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + /* gates: W_ch, W_ih, W_fh, W_oh */ + ymm_t ymm_c = ymm_t(0); + ymm_t ymm_i = ymm_t(1); + ymm_t ymm_f = ymm_t(2); + ymm_t ymm_o = ymm_t(3); + ymm_t ymm_ct_1 = ymm_t(4); + ymm_t ymm_wp0 = ymm_t(5); + ymm_t ymm_wp1 = ymm_t(6); + ymm_t ymm_wp2 = ymm_t(7); + vmovups(ymm_c, ptr[reg_ptr_gates + offset]); + vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]); + vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]); + vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]); + if (!compute_c1h1_) { + vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); + } + if (use_peephole_) { + vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]); + vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]); + vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]); + } + /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */ + // act_cand(c) + act(ymm_c, ymm_c, act_cand_); + // act_gate(i) or act_gate(ct_1 * wp0 + i) + if (!compute_c1h1_ && use_peephole_) { + vmulps(ymm_wp0, ymm_ct_1, ymm_wp0); + vaddps(ymm_i, ymm_i, ymm_wp0); + } + act(ymm_i, ymm_i, act_gate_); + vmulps(ymm_c, ymm_c, ymm_i); + if (!compute_c1h1_) { + // act_gate(f) or act_gate(ct_1 * wp1 + f) + if (use_peephole_) { + vmulps(ymm_wp1, ymm_ct_1, ymm_wp1); + vaddps(ymm_f, ymm_f, ymm_wp1); + } + act(ymm_f, ymm_f, act_gate_); + // ct + vmulps(ymm_f, ymm_f, ymm_ct_1); + vaddps(ymm_f, ymm_f, ymm_c); + } + /* H_t = act_cell(C_t) * act_gate(o) */ + // act_cell(C_t) + ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; + ymm_t ymm_tmp = ymm_i; + act(ymm_tmp, ymm_ct, act_cell_); + // act_gate(o) or act_gate(ct * wp2 + o) + if (use_peephole_) { + vmulps(ymm_wp2, ymm_ct, ymm_wp2); + vaddps(ymm_o, ymm_o, ymm_wp2); + } + act(ymm_o, ymm_o, act_gate_); + // ht + vmulps(ymm_o, ymm_o, ymm_tmp); + // save ct and ht + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); + vmovups(ptr[reg_ptr_ht + offset], ymm_o); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + + if (use_peephole_) { + postCode(); + } else { + ret(); + } +} + +bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } + +void GRUJitCode::generate() { + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ht_1 = r9; + reg64_t reg_ptr_ht = r10; + mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]); + mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]); + ymm_t ymm_one = ymm_t(0); + + if (id_ == 2) { + reg64_t reg_ptr_tmp = r11; + mov(reg_ptr_tmp, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); + } + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + ymm_t ymm_u = ymm_t(1); + ymm_t ymm_r = ymm_t(2); + ymm_t ymm_s = ymm_t(3); + ymm_t ymm_ht_1 = ymm_t(4); + // W: {W_update, W_reset; W_state} + if (id_ == 0 || id_ == 2) { + vmovups(ymm_u, ptr[reg_ptr_gates + offset]); + vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]); + } + if (id_ == 1) { + vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]); + } + if (id_ == 1 || id_ == 2) { + vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]); + } + + if (id_ == 0) { + // ht = act_gate(u) * act_cand(s) + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_s); + } else if (id_ == 1) { + // ht = act_gate(r) * ht_1 + act(ymm_r, ymm_r, act_gate_); + vmulps(ymm_r, ymm_r, ymm_ht_1); + vmovups(ptr[reg_ptr_ht + offset], ymm_r); + } else if (id_ == 2) { + // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 + ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx()); + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vsubps(ymm_u, ymm_one_inner, ymm_u); + vmulps(ymm_u, ymm_ht_1, ymm_u); + vaddps(ymm_u, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_u); + } + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + + ret(); +} } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 64ef55de7cf73fea4538cc0d8fa6d316ddaff2f8..a9214621295a7740b804b26c02d216dd5118d8bb 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/jit_gen.h" +#include "paddle/fluid/operators/math/jit_kernel_impl.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -46,14 +47,6 @@ extern const float exp_float_consts[]; extern const int exp_int_0x7f[]; extern int g_tmp_mem[]; -// TODO(TJ): move these to some proper place -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - #define ALIGN32 __attribute__((aligned(32))) #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f @@ -176,31 +169,34 @@ class VActJitCode : public JitCode { protected: // compute relu with ymm, xmm template - void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT + void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT + JMM zero = JMM(zero_idx); + vxorps(zero, zero, zero); vmaxps(dst, src, zero); } // compute exp with ymm, xmm template - void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT - int mask_idx = 4, int tmp_idx = 5) { - using namespace platform::jit; // NOLINT - assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enfore + void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT + int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { + using namespace platform::jit; // NOLINT // check all idx can not equal + JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); JMM jmm_fy = JMM(fy_idx); JMM jmm_mask = JMM(mask_idx); JMM jmm_tmp = JMM(tmp_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(src, src, jmm_tmp); + vminps(jmm_src, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(src, src, jmm_tmp); + vmaxps(jmm_src, jmm_src, jmm_tmp); // express exp(x) as exp(g + n*log(2)) vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(jmm_fx, src, jmm_tmp); + vmulps(jmm_fx, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); vaddps(jmm_fx, jmm_fx, jmm_tmp); vroundps(jmm_fy, jmm_fx, 0x01); @@ -214,21 +210,21 @@ class VActJitCode : public JitCode { vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); JMM ymm_z = JMM(jmm_mask.getIdx()); vmulps(ymm_z, jmm_fx, jmm_tmp); - vsubps(src, src, jmm_fy); - vsubps(src, src, ymm_z); - vmulps(ymm_z, src, src); + vsubps(jmm_src, jmm_src, jmm_fy); + vsubps(jmm_src, jmm_src, ymm_z); + vmulps(ymm_z, jmm_src, jmm_src); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(dst, src, jmm_tmp); + vmulps(dst, jmm_src, jmm_tmp); for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; i += (YMM_FLOAT_BLOCK * sizeof(float))) { vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 vaddps(dst, dst, jmm_tmp); - vmulps(dst, dst, src); + vmulps(dst, dst, jmm_src); } vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); vaddps(dst, dst, jmm_tmp); vmulps(dst, dst, ymm_z); - vaddps(dst, dst, src); + vaddps(dst, dst, jmm_src); vmovaps(jmm_tmp, ptr[reg_ptr_global]); vaddps(dst, dst, jmm_tmp); // build 2^n @@ -265,20 +261,23 @@ class VActJitCode : public JitCode { // compute sigmoid with ymm, xmm template - void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) { + void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { // y = 1 / (1 + e^-x) JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_src = JMM(src_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(src, src, jmm_tmp); + vminps(jmm_src, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(src, src, jmm_tmp); + vmaxps(jmm_src, jmm_src, jmm_tmp); vxorps(jmm_tmp, jmm_tmp, jmm_tmp); - vsubps(src, jmm_tmp, src); - exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vsubps(jmm_src, jmm_tmp, jmm_src); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(dst, dst, jmm_tmp); vdivps(dst, jmm_tmp, dst); @@ -287,19 +286,22 @@ class VActJitCode : public JitCode { // compute tanh with ymm, xmm template - void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT - int mask_idx = 4, int tmp_idx = 5) { + void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_src = JMM(src_idx); JMM jmm_tmp = JMM(tmp_idx); JMM jmm_zero = JMM(mask_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); vxorps(jmm_zero, jmm_zero, jmm_zero); vsubps(jmm_tmp, jmm_zero, jmm_tmp); - vmulps(src, src, jmm_tmp); - exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmulps(jmm_src, jmm_src, jmm_tmp); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(dst, dst, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); @@ -309,6 +311,30 @@ class VActJitCode : public JitCode { pop(reg_ptr_global); } + template + void act(JMM& dst, JMM& src, operand_type type) { // NOLINT + // use 11~15 + switch (type) { + case operand_type::relu: + relu_jmm(dst, src, 15); + break; + case operand_type::exp: + exp_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::sigmoid: + sigmoid_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::tanh: + tanh_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::identity: + break; + default: + // throw error + break; + } + } + protected: int num_; operand_type type_; @@ -322,6 +348,148 @@ class VActJitCode : public JitCode { ymm_t ymm_dst = ymm_t(1); }; +class LSTMJitCode : public VActJitCode { + public: + const char* name() const override { + std::string base = "LSTMJitCode"; + if (use_peephole_) { + base += "_Peephole"; + } + if (compute_c1h1_) { + base += "_C1H1"; + } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + AddTypeStr(act_cell_); + return base.c_str(); + } + + explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr, + size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + compute_c1h1_(compute_c1h1) { + auto typeExchange = [](const std::string& type) -> gen::operand_type { + if (type == "sigmoid") { + return operand_type::sigmoid; + } else if (type == "relu") { + return operand_type::relu; + } else if (type == "tanh") { + return operand_type::tanh; + } else if (type == "identity" || type == "") { + return operand_type::identity; + } // else throw error + return operand_type::identity; + }; + num_ = attr.d; + use_peephole_ = attr.use_peephole; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + act_cell_ = typeExchange(attr.act_cell); + } + static bool init(int d); + void generate() override; + + protected: + int num_; + bool compute_c1h1_; + bool use_peephole_; + operand_type act_gate_; + operand_type act_cand_; + operand_type act_cell_; + reg64_t param1{abi_param1}; +}; + +class GRUJitCode : public VActJitCode { + public: + const char* name() const override { + std::string base = "GRUJitCode"; + if (id_ == 0) { + base += "_H1"; + } else if (id_ == 1) { + base += "_HtPart1"; + } else if (id_ == 2) { + base += "_HtPart2"; + } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + return base.c_str(); + } + + explicit GRUJitCode(int id, const gru_attr_t& attr, + size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + id_(id) { + auto typeExchange = [](const std::string& type) -> gen::operand_type { + if (type == "sigmoid") { + return operand_type::sigmoid; + } else if (type == "relu") { + return operand_type::relu; + } else if (type == "tanh") { + return operand_type::tanh; + } else if (type == "identity" || type == "") { + return operand_type::identity; + } // else throw error + return operand_type::identity; + }; + num_ = attr.d; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + } + static bool init(int d); + void generate() override; + + protected: + int id_; + int num_; + operand_type act_gate_; + operand_type act_cand_; + reg64_t param1{abi_param1}; +}; + #ifdef PADDLE_WITH_MKLDNN struct EltwiseMulnChw16cNC : public Xbyak::CodeGenerator { explicit EltwiseMulnChw16cNC(size_t code_size = 256 * 1024) diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 82d808f415c3b4ed2688d034aad13610ae2ab0f4..b78b92b4f97b761654a5b9b178f96c1dc99f7789 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -17,6 +17,7 @@ limitations under the License. */ #include // for shared_ptr #include #include +#include "paddle/fluid/operators/math/jit_kernel_impl.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/macros.h" @@ -26,14 +27,7 @@ namespace operators { namespace math { namespace jitkernel { -// TODO(TJ): move these to some proper place -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - +// TODO(TJ): remove me typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; class Kernel { @@ -128,24 +122,18 @@ class VTanhKernel : public VActKernel {}; template class LSTMKernel : public Kernel { public: - virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht, - /* below only used in peephole*/ - const T *wp_data = nullptr, - T *checked = nullptr) const = 0; - // compute c1 and h1 without c0 or h0 - virtual void ComputeC1H1(T *gates, T *ct, T *ht, - /* below only used in peephole*/ - const T *wp_data = nullptr) const = 0; + void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *); + void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *); }; template class GRUKernel : public Kernel { public: // compute h1 without h0 - virtual void ComputeH1(T *gates, T *ht) const = 0; - virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0; - virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0; + void (*ComputeH1)(gru_t *, const gru_attr_t *); + void (*ComputeHtPart1)(gru_t *, const gru_attr_t *); + void (*ComputeHtPart2)(gru_t *, const gru_attr_t *); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index a143b51439f55d1f80d7936dfad46e31bd19f0cb..a0f93fd8e7eb7d81211724a6991a681e2a0ed9ce 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_XBYAK @@ -31,49 +32,6 @@ namespace math { namespace jitkernel { namespace jit = platform::jit; -template -void VMulRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template -void VAddRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - -template -void VAddReluRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} - -template -void VScalRefer(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] * x[i]; - } -} - -template -void VAddBiasRefer(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] + x[i]; - } -} - -template -void VReluRefer(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - #ifdef PADDLE_WITH_MKLML template void VMulMKL(const T* x, const T* y, T* z, int n); @@ -109,7 +67,7 @@ void VScalMKL(const float* a, const float* x, float* y, int n) { if (x == y) { platform::dynload::cblas_sscal(n, *a, y, 1); } else { - VScalRefer(a, x, y, n); + refer::VScal(a, x, y, n); } } @@ -118,7 +76,7 @@ void VScalMKL(const double* a, const double* x, double* y, int n) { if (x == y) { platform::dynload::cblas_dscal(n, *a, y, 1); } else { - VScalRefer(a, x, y, n); + refer::VScal(a, x, y, n); } } @@ -147,7 +105,7 @@ class VMulKernelImpl : public VMulKernel { return; } #endif - this->Compute = VMulRefer; + this->Compute = refer::VMul; } #ifdef PADDLE_WITH_XBYAK @@ -198,7 +156,7 @@ class VAddKernelImpl : public VAddKernel { return; } #endif - this->Compute = VAddRefer; + this->Compute = refer::VAdd; } #ifdef PADDLE_WITH_XBYAK @@ -280,7 +238,7 @@ class VAddReluKernelImpl : public VAddReluKernel { return; } #endif - this->Compute = VAddReluRefer; + this->Compute = refer::VAddRelu; } #ifdef PADDLE_WITH_XBYAK @@ -318,7 +276,7 @@ class VScalKernelImpl : public VScalKernel { return; } #endif - this->Compute = VScalRefer; + this->Compute = refer::VScal; } #ifdef PADDLE_WITH_XBYAK @@ -362,7 +320,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel { } #endif - this->Compute = VAddBiasRefer; + this->Compute = refer::VAddBias; } #ifdef PADDLE_WITH_XBYAK @@ -396,7 +354,7 @@ class VReluKernelImpl : public VReluKernel { } #endif - this->Compute = VReluRefer; + this->Compute = refer::VRelu; } #ifdef PADDLE_WITH_XBYAK @@ -412,16 +370,13 @@ bool VReluKernelImpl::useJIT(int d) { } #endif -template -inline void VIdentityRefer(const T* x, T* y, int n) {} - /* An empty JitKernel */ template class VIdentityKernelImpl : public VIdentityKernel { public: JITKERNEL_DECLARE_STATIC_FUNC; explicit VIdentityKernelImpl(int d) : VIdentityKernel() { - this->Compute = VIdentityRefer; + this->Compute = refer::VIdentity; } }; diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index f26815300de31c47a7ea341307b0051dee99e63b..686f3dd9836cb9192088771753065c6add639620 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" -#include // for exp #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #ifdef PADDLE_WITH_XBYAK #include "paddle/fluid/operators/math/jit_code.h" @@ -25,48 +25,12 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/mklml.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { namespace jitkernel { namespace jit = platform::jit; -// TODO(TJ): move refer codes to one file -// Refer code only focus on correctness -template -void VExpRefer(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -void VSigmoidRefer(const T* x, T* y, int n) { - // y = 1 / (1 + e^-x) - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -} - -template -void VTanhRefer(const T* x, T* y, int n) { - // y = 2 * sigmoid(2x) - 1 - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoidRefer(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup template @@ -129,7 +93,7 @@ class VExpKernelImpl : public VExpKernel { return; } #endif - this->Compute = VExpRefer; + this->Compute = refer::VExp; } #ifdef PADDLE_WITH_XBYAK @@ -182,7 +146,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { return; } #endif - this->Compute = VSigmoidRefer; + this->Compute = refer::VSigmoid; } #ifdef PADDLE_WITH_XBYAK @@ -234,7 +198,7 @@ class VTanhKernelImpl : public VTanhKernel { return; } #endif - this->Compute = VTanhRefer; + this->Compute = refer::VTanh; } #ifdef PADDLE_WITH_XBYAK @@ -267,154 +231,6 @@ REGISTER_JITKERNEL(vexp, VExpKernel); REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); REGISTER_JITKERNEL(vtanh, VTanhKernel); -namespace detail { - -#ifdef __AVX__ - -#define ALIGN32 __attribute__((aligned(32))) - -#define _PS256_CONST(Name, Val) \ - static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ - Val, Val, Val, Val} - -#define _PI256_CONST(Name, Val) \ - static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ - Val, Val, Val, Val} - -_PI256_CONST(0x7f, 0x7f); -_PS256_CONST(one, 1.f); -_PS256_CONST(0p5, 0.5f); -_PS256_CONST(exp_hi, 88.3762626647949f); -_PS256_CONST(exp_lo, -88.3762626647949f); -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS256_CONST(cephes_exp_C1, 0.693359375); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4); -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); - -typedef union imm_xmm_union { - __m256i imm; - __m128i xmm[2]; -} imm_xmm_union; - -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ - { \ - imm_xmm_union u ALIGN32; \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ - } - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ - { \ - imm_xmm_union u ALIGN32; \ - u.xmm[0] = xmm0_; \ - u.xmm[1] = xmm1_; \ - imm_ = u.imm; \ - } - -#define AVX2_BITOP_USING_SSE2(fn) \ - static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \ - /* use SSE2 to perform the bitop AVX2 */ \ - __m128i x1, x2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1, y); \ - x2 = _mm_##fn(x2, y); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return ret; \ - } - -#define AVX2_INTOP_USING_SSE2(fn) \ - static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \ - /* use SSE2 to perform the AVX2 integer operation */ \ - __m128i x1, x2; \ - __m128i y1, y2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1, y1); \ - x2 = _mm_##fn(x2, y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return ret; \ - } - -AVX2_BITOP_USING_SSE2(slli_epi32); -AVX2_INTOP_USING_SSE2(add_epi32); - -#define AVXEXP_BASE \ - __m256 tmp = _mm256_setzero_ps(), fx; \ - __m256 one = *reinterpret_cast(_ps256_one); \ - __m256i imm0; \ - x = _mm256_min_ps(x, *reinterpret_cast(_ps256_exp_hi)); \ - x = _mm256_max_ps(x, *reinterpret_cast(_ps256_exp_lo)); \ - /* express exp(x) as exp(g + n*log(2)) */ \ - fx = _mm256_mul_ps(x, \ - *reinterpret_cast(_ps256_cephes_LOG2EF)); \ - fx = _mm256_add_ps(fx, *reinterpret_cast(_ps256_0p5)); \ - tmp = _mm256_floor_ps(fx); \ - /* if greater, substract 1 */ \ - __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); \ - mask = _mm256_and_ps(mask, one); \ - fx = _mm256_sub_ps(tmp, mask); \ - tmp = _mm256_mul_ps(fx, \ - *reinterpret_cast(_ps256_cephes_exp_C1)); \ - __m256 z = _mm256_mul_ps( \ - fx, *reinterpret_cast(_ps256_cephes_exp_C2)); \ - x = _mm256_sub_ps(x, tmp); \ - x = _mm256_sub_ps(x, z); \ - z = _mm256_mul_ps(x, x); \ - __m256 y = *reinterpret_cast(_ps256_cephes_exp_p0); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p1)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p2)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p3)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p4)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p5)); \ - y = _mm256_mul_ps(y, z); \ - y = _mm256_add_ps(y, x); \ - y = _mm256_add_ps(y, one); \ - /* build 2^n */ \ - imm0 = _mm256_cvttps_epi32(fx) - -__m256 ExpAVX(__m256 x) { - AVXEXP_BASE; - // two AVX2 instructions using SSE2 - imm0 = avx2_mm256_add_epi32(imm0, - *reinterpret_cast(_pi256_0x7f)); - imm0 = avx2_mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} -#endif - -#ifdef __AVX2__ -__m256 ExpAVX2(__m256 x) { - AVXEXP_BASE; - // two AVX2 instructions - imm0 = _mm256_add_epi32(imm0, *reinterpret_cast(_pi256_0x7f)); - imm0 = _mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} -#endif - -} // namespace detail } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ba5f20e53383d3cafab4239f1a2d911addf1ae23 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +typedef struct { + void* gates; // gates: W_ch, W_ih, W_fh, W_oh + const void* ct_1; + void* ct; + void* ht; + /* weight_peephole and checked data are only used in peephole*/ + const void* wp{nullptr}; + void* checked{nullptr}; +} lstm_t; + +typedef struct { + void* gates; // gates: {W_update, W_reset; W_state} + const void* ht_1; + void* ht; +} gru_t; + +struct rnn_attr_s { + int d; + std::string act_gate, act_cand; + rnn_attr_s() = default; + rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand) + : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} +}; + +struct lstm_attr_s : public rnn_attr_s { + bool use_peephole; + std::string act_cell; + lstm_attr_s() = default; + lstm_attr_s(int _d, const std::string& _act_gate, + const std::string& _act_cand, const std::string& _act_cell, + bool _use_peephole = false) + : rnn_attr_s(_d, _act_gate, _act_cand), + use_peephole(_use_peephole), + act_cell(_act_cell) {} +}; + +typedef struct rnn_attr_s gru_attr_t; +typedef struct lstm_attr_s lstm_attr_t; + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 8acf60cfbfd3d47ad52862241b7635aba6982ebf..5a3efd979f803d396a5084c199b1d71b88a77126 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -82,10 +82,10 @@ namespace jitkernel { #define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name, \ marco_declare, macro_find_key, macro_impl) \ marco_define_name(ker_key, ker_class); \ - REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, JITKERNEL_DECLARE, \ - JITKERNEL_FIND_KEY, JITKERNEL_IMPL); \ - REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, JITKERNEL_DECLARE, \ - JITKERNEL_FIND_KEY, JITKERNEL_IMPL) + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, marco_declare, \ + macro_find_key, macro_impl); \ + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, marco_declare, \ + macro_find_key, macro_impl) #define REGISTER_JITKERNEL(ker_key, ker_class) \ REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \ diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h new file mode 100644 index 0000000000000000000000000000000000000000..e0b2e3c7fada6b422318c68a42fd6d103c99af5a --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -0,0 +1,238 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/operators/math/jit_kernel_impl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace refer { +/* Refer code only focus on correctness */ + +template +void VMul(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +template +void VAdd(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +template +void VAddRelu(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? z[i] : 0; + } +} + +template +void VScal(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] * x[i]; + } +} + +template +void VAddBias(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] + x[i]; + } +} + +template +void VRelu(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template +inline void VIdentity(const T* x, T* y, int n) {} + +template +void VExp(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +void VSigmoid(const T* x, T* y, int n) { + // y = 1 / (1 + e^-x) + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); + } +} + +template +void VTanh(const T* x, T* y, int n) { + // y = 2 * sigmoid(2x) - 1 + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + +template +void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT + if (type == "sigmoid") { + return VSigmoid; + } else if (type == "relu") { + return VRelu; + } else if (type == "tanh") { + return VTanh; + } else if (type == "identity" || type == "") { + return VIdentity; + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +// compute ct and ht +template +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + const T* wp = reinterpret_cast(step->wp); + T* checked = reinterpret_cast(step->checked); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + // gates: W_ch, W_ih, W_fh, W_oh + if (attr->use_peephole) { + VMul(wp, ct_1, checked, d); + VMul(wp + d, ct_1, checked + d, d); + VAdd(checked, gates + d, gates + d, d2); + act_gate(gates + d, gates + d, d2); + } else { + act_gate(gates + d, gates + d, d3); + } + + // C_t = C_t-1 * fgated + cand_gated * igated + act_cand(gates, gates, d); + VMul(gates, gates + d, gates + d, d); + VMul(ct_1, gates + d2, gates + d2, d); + VAdd(gates + d, gates + d2, ct, d); + + if (attr->use_peephole) { + // get ogated + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + act_gate(gates + d3, gates + d3, d); + } + // H_t = act_cell(C_t) * ogated + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +// compute c1 and h1 without c0 or h0 +template +void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + /* C_t = igated * cgated*/ + act_gate(gates + d, gates + d, d); + act_cand(gates, gates, d); + VMul(gates, gates + d, ct, d); + if (attr->use_peephole) { + // get outgated, put W_oc * C_t on igated + const T* wp = reinterpret_cast(step->wp); + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + } + /* H_t = act_cell(C_t) * ogated */ + act_gate(gates + d3, gates + d3, d); + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +// compute h1 without h0 +template +void GRUH1(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + int d2 = d * 2; + act_gate(gates, gates, d); + act_cand(gates + d2, gates + d2, d); + VMul(gates, gates + d2, ht, d); +} + +// compute the first part of GRU: ht = act_gate(r) * ht_1 +template +void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { + // W: {W_update, W_reset; W_state} + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + act_gate(gates + attr->d, gates + attr->d, attr->d); + VMul(ht_1, gates + attr->d, ht, attr->d); +} + +// compute the second part of GRU: +// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 +template +void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + T* y = gates + d * 2; + act_gate(gates, gates, d); + act_cand(y, y, d); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } +} + +} // namespace refer +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index e79b0400ab75d1488a26450bd8cde4a0979fc995..2db3274a45610aedea385baf650b8efb42ac39d0 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -15,470 +15,248 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" -#ifdef __AVX__ -#include +#ifdef PADDLE_WITH_XBYAK +#include "paddle/fluid/operators/math/jit_code.h" #endif namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace detail { -#ifdef __AVX__ -__m256 ExpAVX(__m256 x); -#endif - -#ifdef __AVX2__ -__m256 ExpAVX2(__m256 x); -#endif - -} // namespace detail - -namespace jit = platform::jit; -#ifdef __AVX__ -typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type; - -class AVXAct { - public: - virtual ~AVXAct() = default; - virtual __m256 Compute(__m256 x) const = 0; -}; - -template -class AVXActImpl : public AVXAct { +/* LSTM JitKernel */ +template +class LSTMKernelImpl : public LSTMKernel { public: - __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); } -}; - -#define AVX_SIGMOID(isa, expisa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - __m256 ones = _mm256_set1_ps(1.0f); \ - x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \ - x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \ - x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); \ - x = expisa(x); \ - x = _mm256_add_ps(ones, x); \ - return _mm256_div_ps(ones, x); \ - } - -#define AVX_TANH(isa, expisa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - __m256 ones = _mm256_set1_ps(1.0f); \ - x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); \ - x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); \ - x = expisa(x); \ - x = _mm256_add_ps(ones, x); \ - x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); \ - return _mm256_sub_ps(x, ones); \ - } - -#define AVX_RELU(isa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - return _mm256_max_ps(x, _mm256_setzero_ps()); \ + static inline std::string name(const lstm_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; + jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); + this->ComputeCtHt = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096)); + this->ComputeC1H1 = + jitcode1_->getCode(); + return; + } +#endif -#define AVX_IDENTITY(isa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - return x; \ + this->ComputeCtHt = refer::LSTMCtHt; + this->ComputeC1H1 = refer::LSTMC1H1; } -#define FOR_EACH_AVX_ISA(macro_) \ - macro_(jit::avx); \ - macro_(jit::avx2); \ - macro_(jit::avx512f) - -FOR_EACH_AVX_ISA(AVX_RELU); -FOR_EACH_AVX_ISA(AVX_IDENTITY); - -AVX_SIGMOID(jit::avx, detail::ExpAVX); -AVX_TANH(jit::avx, detail::ExpAVX); +#ifdef PADDLE_WITH_XBYAK -#ifdef __AVX2__ -AVX_SIGMOID(jit::avx2, detail::ExpAVX2); -AVX_SIGMOID(jit::avx512f, detail::ExpAVX2); -AVX_TANH(jit::avx2, detail::ExpAVX2); -AVX_TANH(jit::avx512f, detail::ExpAVX2); + private: + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}; #endif +}; -#undef FOR_EACH_AVX_ISA -#undef AVX_IDENTITY -#undef AVX_RELU -#undef AVX_TANH -#undef AVX_SIGMOID - +#ifdef PADDLE_WITH_XBYAK +template <> +bool LSTMKernelImpl::useJIT(int d) { + return gen::LSTMJitCode::init(d); +} #endif +/* Peephole JitKernel */ template -static std::shared_ptr> GetActKernel( - const std::string& type, int n) { - if (type == "sigmoid") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "relu") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "tanh") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "identity" || type == "") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} - -#ifdef __AVX__ -template -static std::unique_ptr GetAVXAct(const std::string& type) { - if (type == "sigmoid") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "relu") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "tanh") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "identity" || type == "") { - return std::unique_ptr(new AVXActImpl()); +class PeepholeKernelImpl : public LSTMKernel { + public: + static inline std::string name(const lstm_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 4 * 8; + jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); + this->ComputeCtHt = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096)); + this->ComputeC1H1 = + jitcode1_->getCode(); + return; + } #endif -/* LSTM JitKernel */ -template -class LSTMKernelImpl : public LSTMKernel { - public: - explicit LSTMKernelImpl(const std::string& act_gate, - const std::string& act_cand, - const std::string& act_cell, int d) - : LSTMKernel() { - d_ = d; - d2_ = d * 2; - d3_ = d * 3; - act_gate_d3_ = GetActKernel(act_gate, d3_); - act_gate_d_ = GetActKernel(act_gate, d); - act_cand_d_ = GetActKernel(act_cand, d); - act_cell_d_ = GetActKernel(act_cell, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - vadd_d_ = KernelPool::Instance().template Get>(d); + this->ComputeCtHt = refer::LSTMCtHt; + this->ComputeC1H1 = refer::LSTMC1H1; } - void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, - T* checked) const override { - // gates: W_ch, W_ih, W_fh, W_oh - act_gate_d3_->Compute(gates + d_, gates + d_, d3_); - - /* C_t = C_t-1 * fgated + cand_gated * igated */ - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, gates + d_, d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); - - /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } - void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { - /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_, d_); - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, ct, d_); - /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } +#ifdef PADDLE_WITH_XBYAK private: - int d_, d2_, d3_; - std::shared_ptr> act_gate_d3_, act_gate_d_, act_cand_d_, - act_cell_d_; - std::shared_ptr> vmul_d_; - std::shared_ptr> vadd_d_; -#ifdef __AVX__ - std::unique_ptr avx_act_gate_, avx_act_cand_, avx_act_cell_; + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}; #endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - LSTMKernelImpl::LSTMKernelImpl( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d) \ - : LSTMKernel() { \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_cand_ = GetAVXAct(act_cand); \ - avx_act_cell_ = GetAVXAct(act_cell); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeCtHt( \ - float* gates, const float* ct_1, float* ct, float* ht, \ - const float* wp_data, float* checked) const { \ - /* gates: W_ch, W_ih, W_fh, W_oh */ \ - __m256 c, i, f, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - f = _mm256_loadu_ps(gates + 16); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ - i = _mm256_loadu_ps(ct_1); \ - f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ - f = _mm256_add_ps(c, f); \ - _mm256_storeu_ps(ct, f); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeC1H1( \ - float* gates, float* ct, float* ht, const float* wp_data) const { \ - __m256 c, i, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = igated * cgated*/ \ - c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \ - _mm256_storeu_ps(ct, c); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } - -// TODO(TJ): optimize keq16 - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); +#ifdef PADDLE_WITH_XBYAK +template <> +bool PeepholeKernelImpl::useJIT(int d) { + return gen::LSTMJitCode::init(d); +} #endif -/* Peephole JitKernel */ -template -class PeepholeKernelImpl : public LSTMKernel { - public: - explicit PeepholeKernelImpl(const std::string& act_gate, - const std::string& act_cand, - const std::string& act_cell, int d) - : LSTMKernel() { - d_ = d; - d2_ = d * 2; - d3_ = d * 3; - act_gate_d_ = GetActKernel(act_gate, d); - act_cand_d_ = GetActKernel(act_cand, d); - act_cell_d_ = GetActKernel(act_cell, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - vadd_d_ = KernelPool::Instance().template Get>(d); - vadd_d2_ = KernelPool::Instance().template Get>(d2_); - act_gate_d2_ = GetActKernel(act_gate, d2_); +#define JITKERNEL_DEFINE_NAME_LSTM(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(const lstm_attr_t& attr) { \ + std::string key(#ker_key "f"); \ + key += (attr.act_gate + attr.act_cand + attr.act_cell + \ + (attr.use_peephole ? "p" : "n")); \ + if (useJIT(attr.d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(attr.d); \ + } else if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(const lstm_attr_t& attr) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ } - void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, - T* checked) const override { - /* get fgated and igated*/ - vmul_d_->Compute(wp_data, ct_1, checked, d_); - vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); - vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); - act_gate_d2_->Compute(gates + d_, gates + d_, d2_); - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, gates + d_, d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); - /* get ogated*/ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); +#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const lstm_attr_t&>( \ + const lstm_attr_t& attr) + +#define JITKERNEL_FIND_KEY_LSTM(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(attr) + +#define JITKERNEL_LSTM_IMPL(ker, dtype) \ + if (attr.use_peephole) { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); \ + } else { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); \ } - void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { - /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_, d_); - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, ct, d_); - /* get outgated, put W_oc * C_t on igated */ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } +REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM, + JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM, + JITKERNEL_LSTM_IMPL); - private: - int d_, d2_, d3_; - std::shared_ptr> act_gate_d2_, act_gate_d_, act_cand_d_, - act_cell_d_; - std::shared_ptr> vmul_d_; - std::shared_ptr> vadd_d_, vadd_d2_; -}; - -#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> \ - KernelPool::Get, const std::string&, \ - const std::string&, const std::string&, int, bool>( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d, bool use_peephole) - -#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \ - #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \ - (use_peephole ? "p" : "n") - -#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \ - if (use_peephole) { \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>( \ - act_gate, act_cand, act_cell, d)); \ - } else { \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(act_gate, act_cand, \ - act_cell, d)); \ - } - -REGISTER_JITKERNEL_ARGS_DEPRECATED(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, - JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); - -#undef INTRI8_FLOAT +#undef JITKERNEL_LSTM_IMPL +#undef JITKERNEL_FIND_KEY_LSTM #undef JITKERNEL_DECLARE_LSTM -#undef JITKERNEL_KEY_LSTM -#undef JITKERNEL_NEW_LSTM_IMPL +#undef JITKERNEL_DEFINE_NAME_LSTM /* GRU JitKernel */ -template +template class GRUKernelImpl : public GRUKernel { public: - explicit GRUKernelImpl(const std::string& act_gate, - const std::string& act_state, int d) - : GRUKernel() { - d_ = d; - d2_ = d * 2; - act_gate_d2_ = GetActKernel(act_gate, d2_); - act_gate_d_ = GetActKernel(act_gate, d); - act_state_d_ = GetActKernel(act_state, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - } - - void ComputeH1(T* gates, T* ht) const override { - act_gate_d_->Compute(gates, gates, d_); - act_state_d_->Compute(gates + d2_, gates + d2_, d_); - vmul_d_->Compute(gates, gates + d2_, ht, d_); + static inline std::string name(const gru_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } - - void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { - // W: {W_update, W_reset; W_state} - act_gate_d2_->Compute(gates, gates, d2_); - vmul_d_->Compute(ht_1, gates + d_, ht, d_); - } - - void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { - T* y = gates + d2_; - act_state_d_->Compute(y, y, d_); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d_; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; + jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096)); + this->ComputeH1 = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::GRUJitCode(1, attr, sz > 4096 ? sz : 4096)); + this->ComputeHtPart1 = + jitcode1_->getCode(); + + jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096)); + this->ComputeHtPart2 = + jitcode2_->getCode(); + return; } +#endif + this->ComputeH1 = refer::GRUH1; + this->ComputeHtPart1 = refer::GRUHtPart1; + this->ComputeHtPart2 = refer::GRUHtPart2; } +#ifdef PADDLE_WITH_XBYAK private: - int d_, d2_; - std::shared_ptr> act_gate_d2_, act_gate_d_, act_state_d_; - std::shared_ptr> vmul_d_; -#ifdef __AVX__ - std::unique_ptr avx_act_gate_, avx_act_state_; + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}, + jitcode2_{nullptr}; #endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - GRUKernelImpl::GRUKernelImpl( \ - const std::string& act_gate, const std::string& act_state, int d) \ - : GRUKernel() { \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_state_ = GetAVXAct(act_state); \ - } \ - template <> \ - void GRUKernelImpl::ComputeH1(float* gates, float* ht) \ - const { \ - __m256 u, s; \ - /* W: {W_update, W_reset; W_state} */ \ - u = _mm256_loadu_ps(gates); \ - s = _mm256_loadu_ps(gates + 16); \ - s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \ - _mm256_storeu_ps(ht, s); \ - } \ - template <> \ - void GRUKernelImpl::ComputeHtPart1( \ - float* gates, const float* ht_1, float* ht) const { \ - /* not exactly equal the any implementation */ \ - __m256 r, ht0; \ - r = _mm256_loadu_ps(gates + 8); \ - ht0 = _mm256_loadu_ps(ht_1); \ - r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0); \ - _mm256_storeu_ps(ht, r); \ - } \ - template <> \ - void GRUKernelImpl::ComputeHtPart2( \ - float* gates, const float* ht_1, float* ht) const { \ - /* not exactly equal the any implementation */ \ - __m256 u, s, ht0; \ - u = _mm256_loadu_ps(gates); \ - s = _mm256_loadu_ps(gates + 16); \ - ht0 = _mm256_loadu_ps(ht_1); \ - u = avx_act_gate_->Compute(u); \ - s = _mm256_mul_ps(u, avx_act_state_->Compute(s)); \ - u = _mm256_sub_ps(_mm256_set1_ps(1.f), u); \ - u = _mm256_mul_ps(u, ht0); \ - u = _mm256_add_ps(s, u); \ - _mm256_storeu_ps(ht, u); \ - } - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); +#ifdef PADDLE_WITH_XBYAK +template <> +bool GRUKernelImpl::useJIT(int d) { + return gen::GRUJitCode::init(d); +} #endif -#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> KernelPool::Get< \ - GRUKernel, const std::string&, const std::string&, int>( \ - const std::string& act_gate, const std::string& act_state, int d) +#define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(const gru_attr_t& attr) { \ + std::string key(#ker_key "f"); \ + key += (attr.act_gate + attr.act_cand); \ + if (useJIT(attr.d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(attr.d); \ + } else if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(const gru_attr_t& attr) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } + +#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const gru_attr_t&>( \ + const gru_attr_t& attr) -#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \ - #ker_key #dtype_key + std::to_string(d) + act_gate + act_state +#define JITKERNEL_FIND_KEY_GRU(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(attr) -#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(act_gate, act_state, d)); +#define JITKERNEL_GRU_IMPL(ker, dtype) \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); -REGISTER_JITKERNEL_ARGS_DEPRECATED(gru, GRUKernel, JITKERNEL_DECLARE_GRU, - JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL); +REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DEFINE_NAME_GRU, + JITKERNEL_DECLARE_GRU, JITKERNEL_FIND_KEY_GRU, + JITKERNEL_GRU_IMPL); -#undef INTRI8_FLOAT -#undef JITKERNEL_NEW_GRU_IMPL -#undef JITKERNEL_KEY_GRU +#undef JITKERNEL_GRU_IMPL +#undef JITKERNEL_FIND_KEY_GRU #undef JITKERNEL_DECLARE_GRU +#undef JITKERNEL_DEFINE_NAME_GRU } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index b6c62a26348cdc20582cf7465f93026402051587..2e5072ca790a00cf0185adf32d64a3b392ff0019 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" -#include #include // for exp #include // for memcpy #include @@ -22,6 +21,8 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" +#include "paddle/fluid/platform/port.h" #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -53,12 +54,6 @@ void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), } } -void vrelu_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0.f ? x[i] : 0.f; - } -} - #if defined __AVX__ || defined __AVX2__ void vrelu_intri8(const int n, const float* x, float* y) { __m256 tmp = _mm256_loadu_ps(x); @@ -69,6 +64,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { TEST(JitKernel, vrelu) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -80,7 +76,7 @@ TEST(JitKernel, vrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vrelu_ref(d, x_data, zref_data); + refer::VRelu(x_data, zref_data, d); } auto trefe = GetCurrentUS(); #if defined __AVX__ || defined __AVX2__ @@ -90,7 +86,7 @@ TEST(JitKernel, vrelu) { vrelu_intri8(d, x_data, zref_data); } auto si1 = GetCurrentUS(); - VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat << " us"; } #endif auto ttgts = GetCurrentUS(); @@ -100,21 +96,16 @@ TEST(JitKernel, vrelu) { auto ttgte = GetCurrentUS(); VLOG(30) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vaddbias_ref(const int n, const float a, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] + a; - } -} - TEST(JitKernel, vaddbias) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -127,7 +118,7 @@ TEST(JitKernel, vaddbias) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddbias_ref(d, a, x_data, zref_data); + refer::VAddBias(&a, x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -138,19 +129,13 @@ TEST(JitKernel, vaddbias) { VLOG(30) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vexp_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - #ifdef PADDLE_WITH_MKLML void vexp_mkl(const int n, const float* x, float* y) { paddle::platform::dynload::vsExp(n, x, y); @@ -159,6 +144,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -170,7 +156,7 @@ TEST(JitKernel, vexp) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vexp_ref(d, x_data, zref_data); + refer::VExp(x_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -196,26 +182,13 @@ TEST(JitKernel, vexp) { #else << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -inline float _sigmoid(float x) { - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - float tmp = (x < min) ? min : ((x > max) ? max : x); - return 1.f / (1.f + std::exp(-tmp)); -} - -void vsigmoid_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = _sigmoid(x[i]); - } -} - void vsigmoid_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VExpKernel>& vexp, @@ -234,6 +207,7 @@ void vsigmoid_better( TEST(JitKernel, vsigmoid) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -252,7 +226,7 @@ TEST(JitKernel, vsigmoid) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vsigmoid_ref(d, x_data, zref_data); + refer::VSigmoid(x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -264,21 +238,13 @@ TEST(JitKernel, vsigmoid) { VLOG(30) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; } - -void vtanh_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = _tanh(x[i]); - } -} - void vtanh_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VScalKernel>& vscal, @@ -298,6 +264,7 @@ void vtanh_better( TEST(JitKernel, vtanh) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -320,7 +287,7 @@ TEST(JitKernel, vtanh) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vtanh_ref(d, x_data, zref_data); + refer::VTanh(x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -332,39 +299,13 @@ TEST(JitKernel, vtanh) { VLOG(30) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void lstm_ctht_ref( - const std::shared_ptr< - const paddle::operators::math::jitkernel::VSigmoidKernel>& - vsigmoid_3d, - const std::shared_ptr< - const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, - const std::shared_ptr< - const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, - const int d, float* gates, const float* ct_1, float* ct, float* ht) { - vsigmoid_3d->Compute(gates + d, gates + d, 3 * d); - vtanh_d->Compute(gates, gates, d); - const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - for (int k = 0; k < d; ++k) { - // C_t = C_t-1 * fgated + cand_gated * igated - ct[k] = ct_1[k] * f[k] + gates[k] * i[k]; - // H_t = act_cell(C_t) * ogated - float tmp = ct[k] * 2; - tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vexp_1->Compute(&tmp, &tmp, 1); - tmp = 2.f / (1.f + tmp) - 1.f; - ht[k] = tmp * o[k]; - } -} - void lstm_ctht_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VSigmoidKernel>& @@ -389,6 +330,7 @@ void lstm_ctht_better( TEST(JitKernel, lstm) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) { int d4 = d * 4; int d3 = d * 3; @@ -399,19 +341,17 @@ TEST(JitKernel, lstm) { RandomVec(d, ct_1.data(), -2.f, 2.f); memcpy(xref.data(), x.data(), sizeof(float) * d4); std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell, false); const auto& ker = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, d, false); + .template Get, const jit::lstm_attr_t&>( + attr); // below kernels are used to compute refer const auto& vsigmoid_3d = jit::KernelPool::Instance().template Get>( d3); const auto& vtanh_d = jit::KernelPool::Instance().template Get>(d); - const auto& vexp_1 = - jit::KernelPool::Instance().template Get>(1); const auto& vmul_d = jit::KernelPool::Instance().template Get>(d); const auto& vadd_d = @@ -425,9 +365,17 @@ TEST(JitKernel, lstm) { float* ct_ref_data = ct_ref.data(); float* ht_ref_data = ht_ref.data(); // compute once to check correctness - lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, - ct_ref_data, ht_ref_data); - ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + jit::lstm_t step; + step.gates = xref_data; + step.ct_1 = ct_1_data; + step.ct = ct_ref_data; + step.ht = ht_ref_data; + refer::LSTMCtHt(&step, &attr); + + step.gates = x_data; + step.ct = ct_tgt_data; + step.ht = ht_tgt_data; + ker->ComputeCtHt(&step, &attr); for (int i = 0; i < d; ++i) { EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3); @@ -441,32 +389,21 @@ TEST(JitKernel, lstm) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, - ct_ref_data, ht_ref_data); + refer::LSTMCtHt(&step, &attr); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + ker->ComputeCtHt(&step, &attr); } auto ttgte = GetCurrentUS(); VLOG(30) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat << " us, better(jit) takes: " << (tmkle - tmkls) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; } } -void vscal_ref(const int n, const float a, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = a * x[i]; - } -} -void vscal_inp_ref(const int n, const float a, float* x) { - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -} #if defined __AVX__ || defined __AVX2__ void vscal_intri8(const int n, const float a, const float* x, float* y) { __m256 tmp; @@ -492,6 +429,7 @@ void vscal_inp_mkl(const int n, const float a, float* x) { TEST(JitKernel, vscal) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -506,12 +444,12 @@ TEST(JitKernel, vscal) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vscal_ref(d, a, x_data, zref_data); + refer::VScal(&a, x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto trefs1 = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vscal_inp_ref(d, a, y_data); + refer::VScal(&a, y_data, y_data, d); } auto trefe1 = GetCurrentUS(); @@ -536,7 +474,7 @@ TEST(JitKernel, vscal) { } auto si3 = GetCurrentUS(); VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat - << " us, inplace: " << (si3 - si2) / repeat; + << " us, inplace: " << (si3 - si2) / repeat << " us"; } #endif @@ -560,19 +498,14 @@ TEST(JitKernel, vscal) { << " us, " #endif << "tgt takes: " << (ttgte - ttgts) / repeat - << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat; + << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat + << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vmul_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - #if defined __AVX__ || defined __AVX2__ void vmul_intri8(const int n, const float* x, const float* y, float* z) { __m256 tmpx, tmpy; @@ -591,6 +524,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vmul) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 20, 30, 256, 512, 1000, 1024}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -604,7 +538,7 @@ TEST(JitKernel, vmul) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vmul_ref(d, x_data, y_data, zref_data); + refer::VMul(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -640,19 +574,13 @@ TEST(JitKernel, vmul) { #else << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vadd_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - #if defined __AVX__ || defined __AVX2__ void vadd_intri8(const int n, const float* x, const float* y, float* z) { __m256 tmpx, tmpy; @@ -671,6 +599,7 @@ void vadd_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vadd) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -684,7 +613,7 @@ TEST(JitKernel, vadd) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vadd_ref(d, x_data, y_data, zref_data); + refer::VAdd(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -720,19 +649,13 @@ TEST(JitKernel, vadd) { #else << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vaddrelu_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} void vaddrelu_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VAddKernel>& vadd, @@ -745,6 +668,7 @@ void vaddrelu_better( TEST(JitKernel, vaddrelu) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -762,7 +686,7 @@ TEST(JitKernel, vaddrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddrelu_ref(d, x_data, y_data, zref_data); + refer::VAddRelu(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); auto tmkls = GetCurrentUS(); @@ -778,7 +702,7 @@ TEST(JitKernel, vaddrelu) { VLOG(30) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat << " us, better takes: " << (tmkle - tmkls) / repeat << " us, " - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -789,21 +713,23 @@ TEST(JitKernel, pool) { namespace jit = paddle::operators::math::jitkernel; const int frame_size = 4; std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); + + // empty call it to avoid unknown flag 'use_pinned_memory' on Mac + paddle::platform::jit::MayIUse(paddle::platform::jit::avx); const auto& plstm1 = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, false); + .template Get, const jit::lstm_attr_t&>(attr); + const auto& plstm2 = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, false); + .template Get, const jit::lstm_attr_t&>(attr); + EXPECT_EQ(plstm1, plstm2); + const auto& peephole = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, true); + .template Get, const jit::lstm_attr_t&>( + jit::lstm_attr_t(frame_size, act_gate, act_cand, act_cell, true)); EXPECT_TRUE(plstm1 != peephole); const auto& pvmul_f = diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 19b2c68c823adbed82319f7b04992baedd5d41f9..76cc796a9b8e21849b1d86e512cd70752fd027ac 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -127,6 +127,9 @@ class SumKernel : public framework::OpKernel { math::scatter::MergeAdd merge_add; merge_add(context.template device_context(), inputs, out); + + out->SyncIndex(); + } else { // no data, just set a empty out tensor. out->mutable_value()->mutable_data(framework::make_ddim({0}), diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 96dc123f6a36e1a2b6ae04e0d97dffe1e10ac4ea..58a74ec2c104f66e9e884cffd00e7fa6622e4714 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase { out_inx_dim[0] = inx.size(); out_inx.Resize(out_inx_dim); + auto &local_scope = scope.NewScope(); std::string var_name = "out_index"; - framework::Variable *tmp_index_var = - const_cast(scope).Var(var_name); + framework::Variable *tmp_index_var = local_scope.Var(var_name); auto &tmp_index_tensor = *(tmp_index_var->GetMutable()); tmp_index_tensor.Resize(out_inx_dim); @@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase { out_dims[axis] = out_dim_sum; out.Resize(out_dims); - LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names); - // Invoke Reshape Op + LodTensorArray2LodTensorVector(local_scope, base_name, Input("X"), &names); + // Invoke concat Op auto concat_op = framework::OpRegistry::CreateOp( "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs); - concat_op->Run(scope, place); + concat_op->Run(local_scope, place); } }; diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index f2d691b2931f5a57e70fd4762e9dea5665ed75c2..b737a6c38d0441cbfcd55ed2c55969ceca68db5d 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) { #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); - omp_set_num_threads(num_threads); + omp_set_num_threads(real_num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 682b0c0ff39b71f08fe1a8b0c9c7b7d386b67738..61a25064d17994e3ce5853017263f24a859c69be 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -62,7 +62,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_ENFORCE(condition) \ do { \ - cudnnStatus_t status = condition; \ + auto status = condition; \ if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \ PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ } \ diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 9a9018cdea6a9dcdebe20fd0faef8ff3d4e0e2a1..3edd727978010e20203ab994562ce922b6ee0bad 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -143,6 +143,39 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; +#if CUDA_VERSION >= 9000 +class ScopedCublasMathMode { + public: + ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) + : handle_(handle) { + need_reset = false; + PADDLE_ENFORCE( + platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), + "Failed to get old cublas math mode"); + if (old_math_mode_ != new_math_mode) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, new_math_mode), + "Failed to set old cublas math mode"); + need_reset = true; + } + } + + ~ScopedCublasMathMode() { + if (need_reset) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, old_math_mode_), + "Failed to set old cublas math mode"); + } + } + + private: + cublasHandle_t handle_; + cublasMath_t old_math_mode_; + bool need_reset; +}; + +#endif + class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext { callback_manager_->Wait(); } +#if CUDA_VERSION >= 9000 + /*! \brief CublasCall may need to change cublas's config, + * but the cublas may be hold by multi-thread, so we should + * add lock here. */ + template + void CublasCall(Callback callback, cublasMath_t new_math) { + std::lock_guard guard(cublas_mtx_); + ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); + callback(); + } +#endif + private: CUDAPlace place_; @@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext { // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes mutable std::mutex callback_mtx_; std::unique_ptr callback_manager_; + + mutable std::mutex cublas_mtx_; }; template <> diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc index 361d3439b844e9f68d3fba0a0e41ec457118a4a9..41648c32fe6f98bb0b78ea7891065e5586f70463 100644 --- a/paddle/fluid/platform/dynload/cublas.cc +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); #endif +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 +CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 4ea0cd7283b55649dbdbbf97f81f10c69ac6a1d2..ced789b90d067218c3b01d124cfd2c93dc94e528 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -61,9 +61,6 @@ extern void *cublas_dso_handle; extern DynLoad__##__name __name #endif -#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ - DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) - #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasSaxpy_v2); \ __macro(cublasDaxpy_v2); \ @@ -106,11 +103,22 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode); +#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + __macro(cublasSetMathMode); \ + __macro(cublasGetMathMode); CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif +// APIs available after CUDA 9.1 +#if CUDA_VERSION >= 9010 +#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + __macro(cublasGemmBatchedEx); \ + __macro(cublasGemmStridedBatchedEx); + +CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#endif + #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 1a83ac7780a01fd3c20bc85baaf14e6ca3f8eb8c..db62377898339def415a13d185f85f34de326d7f 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -48,13 +48,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #else -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline cudnnStatus_t operator()(Args... args) { \ - return ::__name(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) { \ + return ::__name(args...); \ + } \ + }; \ extern DynLoad__##__name __name #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c78f159ad25a17b38333a57a0650d9843c4c5632..20bd349b94bb211b62a0dee32bcad79759fb500f 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -19,13 +19,32 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" -DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, +#ifndef _WIN32 +const float fraction_of_gpu_memory_to_use = 0.92f; +#else +// fraction_of_gpu_memory_to_use cannot be too high on windows, +// since the win32 graphic sub-system can occupy some GPU memory +// which may lead to insufficient memory left for paddle +const float fraction_of_gpu_memory_to_use = 0.5f; +#endif + +DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "Allocate a trunk of gpu memory that is this fraction of the " "total gpu memory size. Future memory usage will be allocated " "from the trunk. If the trunk doesn't have enough gpu memory, " "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_bool( + enable_cublas_tensor_op_math, false, + "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " + "but it may loss precision. Currently, There are two CUDA libraries that" + " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up" + " GEMM computations(the matrices must be either half precision or single " + "precision); cuDNN uses Tensor Cores to speed up both convolutions(the " + "input and output must be half precision) and recurrent neural networks " + "(RNNs)."); + namespace paddle { namespace platform { @@ -64,6 +83,16 @@ int GetCUDADriverVersion(int id) { return driver_version; } +bool TensorCoreAvailable() { +#if CUDA_VERSION >= 9000 + int device = GetCurrentDeviceId(); + int driver_version = GetCUDAComputeCapability(device); + return driver_version >= 70; +#else + return false; +#endif +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index be44158431ff80a41f7fdf4dfd4d070667f2ac63..6a0b3c8e02d49068c2dbe14c7feea7e139947694 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id); //! Get the driver version of the ith GPU int GetCUDADriverVersion(int id); +//! Wheter the current device support TensorCore +bool TensorCoreAvailable(); + //! Get the MultiProcessors of the ith GPU. int GetCUDAMultiProcessors(int i); diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 11c68f3449ee26b64c121acd081479b37c94fac4..ed8734c98cb996721a3002523c9276e6d7f492ae 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -14,11 +14,11 @@ #pragma once +#include #include #include #include #include -#include "ThreadPool.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb6ee2f4a53925f64b61b9fe03f5a4bd7203ed53..25d241d9768c16e1da304a78f259d5a626f702fc 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -5,8 +5,8 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + DEPS ARCHIVE_START ${PYBIND_DEPS} + ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END) else() cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 586e92c2b3146d75a673d1fe326dbee7297a3bfb..0443ff3fc3d055c54527240cdf57c195e1c0acf8 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -30,11 +30,12 @@ namespace pybind11 { namespace detail { // Can be replaced by a generic lambda in C++14 -struct variant_caster_visitor : public boost::static_visitor { +struct __attribute__((visibility("hidden"))) paddle_variant_caster_visitor + : public boost::static_visitor { return_value_policy policy; handle parent; - variant_caster_visitor(return_value_policy policy, handle parent) + paddle_variant_caster_visitor(return_value_policy policy, handle parent) : policy(policy), parent(parent) {} template @@ -44,10 +45,10 @@ struct variant_caster_visitor : public boost::static_visitor { }; template -struct variant_caster; +struct paddle_variant_caster; template