diff --git a/CMakeLists.txt b/CMakeLists.txt index 66dcef0013efb486b532f9ae17e9ae2040dc9e38..d6aa8f1b85c9c4c1a9ccd5b7d5f5607f9db39bc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) +option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) @@ -261,6 +262,12 @@ if (WITH_PROFILER) add_definitions(-DWITH_GPERFTOOLS) endif() +if (WITH_JEMALLOC) + find_package(JeMalloc REQUIRED) + include_directories(${JEMALLOC_INCLUDE_DIR}) + add_definitions(-DWITH_JEMALLOC) +endif() + include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation @@ -290,7 +297,7 @@ if(WITH_PSLIB) list(APPEND EXTERNAL_LIBS pslib_brpc) list(APPEND EXTERNAL_LIBS libmct) endif(WITH_PSLIB) - + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/Dockerfile b/Dockerfile index 84e1edbee91b0c3877ec7c55161af8c3bdf1c0dc..acfd091265e26d6c29c561d166fed2504c0cff1c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 install -U wheel && \ - pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.7 install -U wheel && \ - pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 --no-cache-dir install -U wheel && \ + pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 --no-cache-dir install -U wheel && \ + pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 --no-cache-dir install -U wheel && \ + pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 install opencv-python && \ - pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python + pip --no-cache-dir install -U pip setuptools wheel && \ + pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 --no-cache-dir install opencv-python && \ + pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip --no-cache-dir install opencv-python #For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip3.6 install pylint pytest astroid isort -RUN pip3.7 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker +RUN pip3 --no-cache-dir install pylint pytest astroid isort +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort +RUN pip3.7 --no-cache-dir install pylint pytest astroid isort +RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip3.6 install -r /root/requirements.txt -RUN pip3.7 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt +RUN pip3 --no-cache-dir install -r /root/requirements.txt +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt +RUN pip3.7 --no-cache-dir install -r /root/requirements.txt +RUN pip --no-cache-dir install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip3.6 install certifi urllib3[secure] -RUN pip3.7 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] +RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y +RUN pip3 --no-cache-dir install certifi urllib3[secure] +RUN pip3.6 --no-cache-dir install certifi urllib3[secure] +RUN pip3.7 --no-cache-dir install certifi urllib3[secure] +RUN pip --no-cache-dir install certifi urllib3[secure] # Install woboq_codebrowser to /woboq @@ -149,6 +149,14 @@ RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ -DCMAKE_BUILD_TYPE=Release . \ make) +# ar mishandles 4GB files +# https://sourceware.org/bugzilla/show_bug.cgi?id=14625 +# remove them when apt-get support 2.27 and higher version +RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \ + tar -xzf binutils_2.27.orig.tar.gz && \ + cd binutils-2.27 && \ + ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz + # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd RUN echo 'root:root' | chpasswd diff --git a/cmake/FindJeMalloc.cmake b/cmake/FindJeMalloc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..b95287160ba610b2dfa93ba15e7c7c8214d80ac1 --- /dev/null +++ b/cmake/FindJeMalloc.cmake @@ -0,0 +1,28 @@ +# - Find JeMalloc library +# Find the native JeMalloc includes and library +# +# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. +# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. +# JEMALLOC_FOUND - True if jemalloc found. + +find_path(JEMALLOC_INCLUDE_DIR + NAMES jemalloc/jemalloc.h + HINTS ${JEMALLOC_ROOT_DIR}/include) + +find_library(JEMALLOC_LIBRARIES + NAMES jemalloc + HINTS ${JEMALLOC_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) + +mark_as_advanced( + JEMALLOC_LIBRARIES + JEMALLOC_INCLUDE_DIR) + +if (JEMALLOC_FOUND) + add_library(jemalloc::jemalloc UNKNOWN IMPORTED) + set_target_properties(jemalloc::jemalloc PROPERTIES + IMPORTED_LOCATION ${JEMALLOC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}") +endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4ee2fdcf2db6bfa373f814ee4c0ab4d708486ea8..e3d856fb30d8103f50ebcb6dc16153c8ed2a97a6 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -134,6 +134,7 @@ if(WITH_GPU) message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE) endif() + add_definitions(-DWITH_ANAKIN) endif() if(WITH_ANAKIN) # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 414e92eb27f56e0670e1977e67c2f5ca9c6bbcc2..16432ce2b803f6d21bbf47200eda5404269b750f 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -5,6 +5,8 @@ endif() set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 60 61") +set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75") ###################################################################################### # A function for automatic detection of GPUs installed (if autodetection is enabled) @@ -59,7 +61,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") set(archs_name_default "All") if(NOT CMAKE_CROSSCOMPILING) list(APPEND archs_names "Auto") @@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") + set(cuda_arch_bin "75") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") @@ -139,10 +143,12 @@ endfunction() message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) + add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -150,6 +156,17 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") + add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") +elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"90\"") +elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index fb899e3d7cd4224acd25a559d0e18a09f552ad7d..fff1980637d029b8a392c166734d3c3b84fed867 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -89,6 +89,7 @@ if(CUDNN_FOUND) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() + add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index 5a78a1d1b7dea0d95ae3fa2c9f39679899dd1bcb..12412a51a0fd1aaa9702bd4547fb935d94012ada 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost") # checked that the devtools package of CentOS 6 installs boost 1.41.0. # So we use 1.41.0 here. set(BOOST_VER "1.41.0") -if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) - message(STATUS "use pre defined download url") - set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) - set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) -endif() +set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) +set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index c94849cf4b96746e6c507db2a6310c2f305dacf5..f06728de91e4509be661e56baef641d591928b66 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -32,4 +32,4 @@ endif() add_dependencies(cub extern_cub) -LIST(APPEND externl_project_dependencies cub) +LIST(APPEND external_project_dependencies cub) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 94d8fcc66855627d665b8e84a47a2075e7253b03..4587475d7902a134eecd54bf8241fb96d175d0ba 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -28,4 +28,4 @@ endif() add_dependencies(dlpack extern_dlpack) -LIST(APPEND externl_project_dependencies dlpack) +LIST(APPEND external_project_dependencies dlpack) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c29375cd0589764507f9a10859a6d4fdbb29716e..03f0dee85911bdaa0312b624114b7f4aef1fb723 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -55,7 +55,7 @@ ExternalProject_Add( ${MKLDNN_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} - GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" + GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git" GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" @@ -106,10 +106,10 @@ else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) + DEPENDS mkldnn shared_mkldnn) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) - +ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) IF(WITH_C_API) INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) ENDIF() diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index d49839a89d78803f0fad58192283deae47ad72ef..43322a257a02c3fd756078db6fe20b582826066a 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -17,10 +17,8 @@ IF(NOT ${WITH_MKLML}) ENDIF(NOT ${WITH_MKLML}) IF(APPLE) - MESSAGE(WARNING - "Mac is not supported with MKLML in Paddle yet." - "Force WITH_MKLML=OFF") - SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) + MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE) return() ENDIF() @@ -31,29 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) -if(WIN32) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") + +SET(TIME_VERSION "2019.0.1.20181227") +IF(WIN32) + SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) -else() +ELSE() + SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) -endif() -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") - -IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) - MESSAGE(STATUS "use pre defined download url") - if(WIN32) - SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) - else() - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) - ENDIF() -endif() +ENDIF() SET(MKLML_PROJECT "extern_mklml") MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index e66459fa3a1508fe4a3687f07bbe18f2a5421296..799d9c309f329f5f10364d794a7964ec3b02eeb4 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_VERSION "0.9") -SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") +SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) -SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) +SET(NGRAPH_SHARED_LIB_NAME libngraph.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c6fe2e970d3e02985e3f2b8d5df6a7358beed514..05293b8b06b55bb0b83a30c7eb059efe0b61e57e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -115,6 +115,10 @@ function(common_link TARGET_NAME) if (WITH_PROFILER) target_link_libraries(${TARGET_NAME} gperftools::profiler) endif() + + if (WITH_JEMALLOC) + target_link_libraries(${TARGET_NAME} jemalloc::jemalloc) + endif() endfunction() @@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME) # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() - # msvc will put libarary in directory of "/Release/xxxlib" by default + # msvc will put libarary in directory of "/Release/xxxlib" by default # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 48279bc809dde9e97c967c3ea5d03fbd7b89b017..3e11d332ff71098adf65e487a39351ae57427e9e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -136,7 +136,7 @@ if (WITH_MKLDNN) copy(mkldnn_lib SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn + DEPS mkldnn_shared_lib ) endif () diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 70d159b4f3549662e080794efad8af943ce1f0bc..59c40a0e5d18b753038f2b9301d1c9494e3901be 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -110,7 +110,7 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op") +"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 86096d4feaace040da416a01872882456c4098fc..566dc75fda019eb66759eb403f60e16f18cffef1 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,46 +57,43 @@ int main() return 0; }" SSE3_FOUND) -# disable AVX by default on windows -if(NOT WIN32) - # Check AVX - set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) - set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; - }" AVX_FOUND) +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) - # Check AVX 2 - set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) - set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; - }" AVX2_FOUND) +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) - # Check AVX512F - set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) - set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; - }" AVX512F_FOUND) -endif(NOT WIN32) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 8d95dc0591e1d6bd815cc697528191c2ee8c1cfe..500f64bed9898fa874cbad2ea69aae05df58023e 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -60,7 +60,7 @@ class Float16Transpiler: raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") self.scope = scope diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b6974c6af290438f827c16bb478eb43e3cf42247..9872631553056fbabd8be8162ecee16250b33862 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) -paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope -paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 867970717b38653252dae014a2ec7c0af2daeff8..a167511160d074c13ca1dca36b4f2c5eeea4bb93 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,27 +7,17 @@ function(windows_symbolic TARGET) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") - endif() - -#only copy the xx.cu to.xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() @@ -37,9 +27,10 @@ add_subdirectory(details) proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) -cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) +cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) +cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) @@ -78,17 +69,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) -cc_test(variable_test SRCS variable_test.cc) - cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +if (WITH_GPU) + target_link_libraries(var_type_traits dynload_cuda) +endif() +cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) + +cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) +cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function) + DEPS operator op_registry device_context math_function scope) if(WITH_GPU) if (WIN32) @@ -133,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) if(WITH_NGRAPH) - if(NOT WIN32) - cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) - cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler ngraph) - endif(NOT WIN32) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) + cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler) endif(WITH_NGRAPH) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) @@ -179,11 +174,7 @@ if(WITH_DISTRIBUTE) else() if(WITH_NGRAPH) - if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper) - else(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) else(WITH_NGRAPH) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) endif(WITH_NGRAPH) @@ -193,14 +184,14 @@ endif() target_link_libraries(executor garbage_collector) cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) if(WITH_PSLIB) - cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer) else() - cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer) endif(WITH_PSLIB) diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h index be9efcd74924a2050a2fd9ab83059590a1a2a2fd..b53082986882c80a85826f10d5766525f72c0a97 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/fluid/framework/array.h @@ -15,34 +15,123 @@ #pragma once #include -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/framework/unroll_array_ops.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { + template class Array { - static_assert(N > 0, "The size of array must be larger than 0"); - public: - HOSTDEVICE Array() {} + static constexpr size_t kSize = N; + + HOSTDEVICE inline Array() {} - HOSTDEVICE explicit Array(const T &val) { - for (size_t i = 0; i < N; ++i) data_[i] = val; + template + HOSTDEVICE inline explicit Array(const T &val, Args... args) { + static_assert(N == sizeof...(Args) + 1, "Invalid argument"); + UnrollVarArgsAssign::Run(data_, val, args...); } - HOSTDEVICE const T *Get() const { return data_; } + HOSTDEVICE inline void Fill(const T &val) { + UnrollFillConstant::Run(data_, val); + } - HOSTDEVICE T *GetMutable() { return data_; } + HOSTDEVICE inline const T *Get() const { return data_; } - HOSTDEVICE T &operator[](size_t index) { return data_[index]; } + HOSTDEVICE inline T *GetMutable() { return data_; } - HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } + HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); } + + // Writing "return data_[i]" would cause compilation warning/error: + // "array subscript is above array bound" in Python 35 CI. + // It seems that it is a false warning of GCC if we do not check the bounds + // of array index. But for better performance, we do not check in operator[] + // like what is in STL. If users want to check the bounds, use at() instead + HOSTDEVICE inline const T &operator[](size_t i) const { + return *advance(data_, i); + } + + HOSTDEVICE inline T &at(size_t i) { +#ifndef __CUDA_ARCH__ + PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); +#endif + return (*this)[i]; + } + + HOSTDEVICE inline const T &at(size_t i) const { +#ifndef __CUDA_ARCH__ + PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); +#endif + return (*this)[i]; + } HOSTDEVICE constexpr size_t size() const { return N; } + HOSTDEVICE inline bool operator==(const Array &other) const { + return UnrollCompare::Run(data_, other.data_); + } + + HOSTDEVICE inline bool operator!=(const Array &other) const { + return !(*this == other); + } + private: + template + HOSTDEVICE static inline U *advance(U *ptr, size_t i) { + return ptr + i; + } + T data_[N]; }; +template +class Array { + public: + static constexpr size_t kSize = 0; + + HOSTDEVICE inline Array() {} + + HOSTDEVICE inline void Fill(const T &val) {} + + HOSTDEVICE inline constexpr T *Get() const { return nullptr; } + + // Add constexpr to GetMutable() cause warning in MAC + HOSTDEVICE inline T *GetMutable() { return nullptr; } + + HOSTDEVICE inline T &operator[](size_t) { +#ifdef __CUDA_ARCH__ + static T obj(); + return obj; +#else + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline const T &operator[](size_t) const { +#ifdef __CUDA_ARCH__ + static const T obj(); + return obj; +#else + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; } + + HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; } + + HOSTDEVICE constexpr size_t size() const { return 0; } + + HOSTDEVICE constexpr bool operator==(const Array &other) const { + return true; + } + + HOSTDEVICE constexpr bool operator!=(const Array &other) const { + return false; + } +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index ee3c5e01f87eeb123f43f867296e35cc8adb7e8e..1d9678a1ba1409e5c18d3e25b3aa13dfbbf76908 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // start executing ops in multiple threads for (int thidx = 0; thidx < actual_thread_num; ++thidx) { - threads.push_back( - std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + if (debug) { + threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer, + workers[thidx].get())); + } else { + threads.push_back( + std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + } } for (auto& th : threads) { diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index c9ec5e7a7b37b62efbf3d980e93b5518364d99c9..96a2f9250ff928fe58a5339a25c68c9db515522d 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 05e423b8a52962d47a6615d48243444374b470e3..e7a6df57e538164969bc101ced4b91de8f75ca56 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -18,312 +18,159 @@ limitations under the License. */ namespace paddle { namespace framework { -/// @cond HIDDEN - -template -Dim make_dim(const int64_t* d) { - return Dim(*d, make_dim(d + 1)); -} - -template <> -Dim<0> make_dim<0>(const int64_t* d) { - return Dim<0>(*d); -} - -void make_ddim(DDim& ddim, const int64_t* dims, int n) { - switch (n) { - case 0: - ddim = make_dim<0>(dims); - break; - case 1: - ddim = make_dim<1>(dims); - break; - case 2: - ddim = make_dim<2>(dims); - break; - case 3: - ddim = make_dim<3>(dims); - break; - case 4: - ddim = make_dim<4>(dims); - break; - case 5: - ddim = make_dim<5>(dims); - break; - case 6: - ddim = make_dim<6>(dims); - break; - case 7: - ddim = make_dim<7>(dims); - break; - case 8: - ddim = make_dim<8>(dims); - break; - case 9: - ddim = make_dim<9>(dims); - break; - default: - PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); - } -} - -/// @endcond - DDim make_ddim(std::initializer_list dims) { - DDim result(make_dim(0)); - make_ddim(result, dims.begin(), dims.size()); - return result; + return DDim(dims.begin(), dims.size()); } DDim make_ddim(const std::vector& dims) { - DDim result(make_dim(0)); - make_ddim(result, &dims[0], dims.size()); - return result; + return DDim(dims.data(), dims.size()); } DDim make_ddim(const std::vector& dims) { - std::vector res(dims.size()); - std::transform(dims.begin(), dims.end(), res.begin(), - [](int d) { return static_cast(d); }); - return make_ddim(res); + return DDim(dims.data(), dims.size()); } -/// @cond HIDDEN -// XXX For some reason, putting this in an anonymous namespace causes errors -class DynamicMutableIndexer : public boost::static_visitor { - public: - explicit DynamicMutableIndexer(int idx) : idx_(idx) {} +struct DDimEqualityVisitor { + explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {} template - int64_t& operator()(Dim& dim) const { - return dim[idx_]; + inline bool operator()(const Dim& self) const { + return UnrollCompare::Run(self.Get(), d_); } - private: - int idx_; + const int64_t* d_; }; -class DynamicConstIndexer : public boost::static_visitor { - public: - explicit DynamicConstIndexer(int idx) : idx_(idx) {} - - template - int64_t operator()(const Dim& dim) const { - return dim[idx_]; - } - - private: - int idx_; -}; - -/// @endcond - -int64_t& DDim::operator[](int idx) { - return boost::apply_visitor(DynamicMutableIndexer(idx), var); +bool DDim::operator==(const DDim& d) const { + return size() == d.size() && + this->apply_visitor(DDimEqualityVisitor(d.Get())); } -int64_t DDim::operator[](int idx) const { - return boost::apply_visitor(DynamicConstIndexer(idx), var); -} +bool DDim::operator!=(const DDim& d) const { return !(*this == d); } -int DDim::size() const { return arity(*this); } +struct DDimPlusVisitor { + explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} -bool DDim::operator==(DDim d) const { - if (var.which() != d.getVar().which()) { - return false; - } else { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - - for (unsigned int i = 0; i < v1.size(); i++) { - if (v1[i] != v2[i]) { - return false; - } - } - - return true; + template + inline void operator()(Dim& self) const { + UnrollAdd::Run(d1_, d2_, self.GetMutable()); } -} - -bool DDim::operator!=(DDim d) const { return !(*this == d); } - -DDim DDim::operator+(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - - std::vector v3; - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] + v2[i]); - } + const int64_t* d1_; + const int64_t* d2_; +}; - return make_ddim(v3); +DDim DDim::operator+(const DDim& d) const { + PADDLE_ENFORCE(size() == d.size()); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimPlusVisitor(Get(), d.Get())); + return ret; } -DDim DDim::operator*(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); +struct DDimMulVisitor { + explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} - std::vector v3; - - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] * v2[i]); + template + inline void operator()(Dim& self) const { + UnrollMul::Run(d1_, d2_, self.GetMutable()); } - return make_ddim(v3); + const int64_t* d1_; + const int64_t* d2_; +}; + +DDim DDim::operator*(const DDim& d) const { + PADDLE_ENFORCE(size() == d.size()); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimMulVisitor(Get(), d.Get())); + return ret; } int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } -void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } - -/// @cond HIDDEN -struct VectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - - explicit VectorizeVisitor(std::vector& v) : vector(v) {} - - template - void operator()(const T& t) { - vector.push_back(t.head); - this->operator()(t.tail); - } - - void operator()(const Dim<0>& t) {} -}; -/// @endcond +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } // NOLINT std::vector vectorize(const DDim& ddim) { - std::vector result; - VectorizeVisitor visitor(result); - boost::apply_visitor(visitor, ddim); + std::vector result(DDim::kMaxRank); + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); + result.resize(ddim.size()); return result; } // NOTE: framework::vectorize converts to type int64_t // which does not fit cudnn inputs. std::vector vectorize2int(const DDim& ddim) { - std::vector temp = vectorize(ddim); - std::vector result(temp.begin(), temp.end()); + std::vector result(DDim::kMaxRank); + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); + result.resize(ddim.size()); return result; } -struct ProductVisitor : public boost::static_visitor { +struct ProductVisitor { template - int64_t operator()(const Dim& dim) { + inline int64_t operator()(const Dim& dim) { return product(dim); } }; int64_t product(const DDim& ddim) { - ProductVisitor visitor; - return boost::apply_visitor(visitor, ddim); + return ddim.apply_visitor(ProductVisitor()); } -struct SliceVectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - int begin; - int end; - - SliceVectorizeVisitor(std::vector& v, int b, int e) - : vector(v), begin(b), end(e) { - PADDLE_ENFORCE(begin < end, - "Begin index must be less than end index in ddim slice."); - PADDLE_ENFORCE(begin >= 0, - "Begin index can't be less than zero in ddim slice."); - } - - template - void operator()(const Dim& dim) { - if (begin == 0) { - vector.push_back(dim.head); - } else { - --begin; - } - --end; - if (end > 0) { - this->operator()(dim.tail); - } - } - - void operator()(const Dim<0>& dim) { - PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound."); - } -}; - DDim slice_ddim(const DDim& dim, int begin, int end) { - std::vector vec; - vec.reserve(end - begin); - SliceVectorizeVisitor visitor(vec, begin, end); - boost::apply_visitor(visitor, dim); - return make_ddim(vec); + PADDLE_ENFORCE(begin >= 0 && end <= dim.size(), + "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", + begin, end, dim.size()); + // Constructor of DDim would check whether end - begin is valid + return DDim(dim.Get() + begin, end - begin); } -/// \cond HIDDEN - -struct ArityVisitor : boost::static_visitor { - template - int operator()(Dim) const { - return D; - } -}; - -/// \endcond - -int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } +int arity(const DDim& d) { return d.size(); } -/// \cond HIDDEN - -struct DDimPrinter : boost::static_visitor { +struct DDimPrinter { std::ostream& os; explicit DDimPrinter(std::ostream& os_) : os(os_) {} - template - void operator()(const T& t) { + template + void operator()(const Dim& t) { os << t; } }; -/// \endcond - std::ostream& operator<<(std::ostream& os, const DDim& ddim) { - DDimPrinter printer(os); - boost::apply_visitor(printer, ddim); + ddim.apply_visitor(DDimPrinter(os)); return os; } -DDim::DDim(std::initializer_list init_list) { - *this = make_ddim(init_list); -} - DDim flatten_to_2d(const DDim& src, int num_col_dims) { - int rank = src.size(); - return make_ddim({product(slice_ddim(src, 0, num_col_dims)), - product(slice_ddim(src, num_col_dims, rank))}); + return DDim({product(slice_ddim(src, 0, num_col_dims)), + product(slice_ddim(src, num_col_dims, src.size()))}); } -DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } +DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); } DDim stride(const DDim& ddim) { - std::vector strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = 1; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i + 1]; } - return framework::make_ddim(strides); + return strides; } -DDim stride_numel(const framework::DDim& ddim) { - std::vector strides(ddim.size()); +DDim stride_numel(const DDim& ddim) { + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = ddim[ddim.size() - 1]; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i]; } - return framework::make_ddim(strides); + return strides; } } // namespace framework diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index f05b5ee3faee856a41f1376e5952710b550e7c42..31a41dab2a1f1d6bad9fe697c5d367f32e219160 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -18,62 +18,145 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/dim.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { +#define PADDLE_VISIT_DDIM_BASE(rank, callback) \ + case (rank): { \ + constexpr auto kRank = (rank); \ + return (callback); \ + } + +#define PADDLE_VISIT_DDIM(rank, callback) \ + switch (rank) { \ + PADDLE_VISIT_DDIM_BASE(0, callback); \ + PADDLE_VISIT_DDIM_BASE(1, callback); \ + PADDLE_VISIT_DDIM_BASE(2, callback); \ + PADDLE_VISIT_DDIM_BASE(3, callback); \ + PADDLE_VISIT_DDIM_BASE(4, callback); \ + PADDLE_VISIT_DDIM_BASE(5, callback); \ + PADDLE_VISIT_DDIM_BASE(6, callback); \ + PADDLE_VISIT_DDIM_BASE(7, callback); \ + PADDLE_VISIT_DDIM_BASE(8, callback); \ + PADDLE_VISIT_DDIM_BASE(9, callback); \ + default: \ + PADDLE_THROW("Invalid rank %d", rank); \ + } + +template +inline void dynamic_dim_assign(const T1* in, T2* out, int n) { + PADDLE_VISIT_DDIM(n, (static_dim_assign(in, out))); +} + /** * \brief A dynamically sized dimension. * * The number of dimensions must be between [1, 9]. */ -struct DDim { - typedef boost::variant, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, - Dim<7>, Dim<8>, Dim<9>> - DDimVar; - DDimVar var; +class DDim { + public: + constexpr static int kMaxRank = 9; + + DDim() : rank_(1) { dim_[0] = 0; } - DDim() : var(Dim<1>()) {} + DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } + + DDim(const int* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } + + DDim(const int64_t* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } template - explicit DDim(const Dim& in) : var(in) {} + /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT + UnsafeCast() = in; + } + + /*implicit*/ DDim(std::initializer_list init_list) + : DDim(init_list.begin(), init_list.size()) {} - /*implicit*/ DDim(std::initializer_list init_list); + inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } template - DDim& operator=(const Dim& in) { - var = in; + inline DDim& operator=(const Dim& dim) { + rank_ = D; + UnsafeCast() = dim; return *this; } - int64_t& operator[](int idx); - int64_t operator[](int idx) const; + inline int64_t& operator[](int idx) { return dim_[idx]; } + + inline int64_t operator[](int idx) const { return dim_[idx]; } + + inline int64_t& at(int idx) { + PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); + return dim_[idx]; + } + + inline int64_t at(int idx) const { + PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); + return dim_[idx]; + } template - typename Visitor::result_type apply_visitor(Visitor& visitor) { - return var.apply_visitor(visitor); + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); } template - typename Visitor::result_type apply_visitor(Visitor& visitor) const { - return var.apply_visitor(visitor); + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) const { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); } - DDimVar getVar() { return var; } + bool operator==(const DDim& d) const; + + bool operator!=(const DDim& d) const; + + DDim operator+(const DDim& d) const; - bool operator==(DDim d) const; + DDim operator*(const DDim& d) const; - bool operator!=(DDim d) const; + inline const int64_t* Get() const { return dim_.Get(); } - DDim operator+(DDim d) const; + inline int64_t* GetMutable() { return dim_.GetMutable(); } - DDim operator*(DDim d) const; + inline int size() const { return rank_; } + + private: + template + inline Dim& UnsafeCast() { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + template + inline const Dim& UnsafeCast() const { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } - int size() const; + inline DDim& CopyFrom(const DDim& ddim) { + PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast())); + } + + friend DDim stride(const DDim& ddim); + friend DDim stride_numel(const DDim& ddim); + + private: + Dim dim_; + int rank_; }; +#undef PADDLE_VISIT_DDIM_BASE +#undef PADDLE_VISIT_DDIM + /** * \brief Make a DDim from std::vector * @@ -92,7 +175,7 @@ DDim make_ddim(const std::vector& dims); DDim make_ddim(std::initializer_list dims); int64_t get(const DDim& dim, int idx); -void set(DDim& dim, int idx, int val); +void set(DDim& dim, int idx, int val); // NOLINT std::vector vectorize(const DDim& ddim); std::vector vectorize2int(const DDim& ddim); @@ -129,12 +212,3 @@ DDim stride(const DDim& ddim); DDim stride_numel(const DDim& ddim); } // namespace framework } // namespace paddle - -namespace boost { - -template -T get(const paddle::framework::DDim& in) { - return boost::get(in.var); -} - -} // namespace boost diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 63a68ba3a5c289be7c2d352717fe5911539df8a7..c1ba6606f1064750a9d7e087ded1ec3634bcc4a5 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -77,6 +77,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) + cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory @@ -92,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass - memory_optimize_pass) + memory_optimize_pass lock_free_optimize_pass) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 9eaff1f560147ad053ac599cf141be8a66a5c353..a24e3d3e487e488f0d0c59809a0adc9f9524cc6e 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -19,6 +19,13 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" +// asynchronous nccl allreduce or synchronous issue: +// https://github.com/PaddlePaddle/Paddle/issues/15049 +DEFINE_bool( + sync_nccl_allreduce, false, + "If set true, will call `cudaStreamSynchronize(nccl_stream)`" + "after allreduce, this mode can get better performance in some scenarios."); + namespace paddle { namespace framework { namespace details { @@ -48,100 +55,104 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, void AllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); -// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, -// this is a distributed or inter-process call, find a better way. -#ifdef PADDLE_WITH_CUDA - if (NoDummyInputSize() == 1 && - local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { -#else - if (NoDummyInputSize() == 1) { -#endif - return; // No need to all reduce when GPU count = 1; - } else { - // Wait input done - WaitInputVarGenerated(); - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), places_.size(), - "The NoDummyInputSize should be equal to the number of places."); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); - - std::vector lod_tensors; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto *s = local_scopes_[i]; - auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); - auto &lod_tensor = - local_scope.FindVar(in_var_handles[i]->name_)->Get(); - lod_tensors.emplace_back(&lod_tensor); - PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, - "The name of input and output should be equal."); - } + WaitInputVarGenerated(); + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector lod_tensors; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto *s = local_scopes_[i]; + auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); + auto &lod_tensor = + local_scope.FindVar(in_var_handles[i]->name_)->Get(); + lod_tensors.emplace_back(&lod_tensor); + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); + } - if (platform::is_gpu_place(lod_tensors[0]->place())) { + if (platform::is_gpu_place(lod_tensors[0]->place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); - int dtype = -1; - size_t numel = 0; - std::vector> all_reduce_calls; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; - auto &lod_tensor = *lod_tensors[i]; - void *buffer = const_cast(lod_tensor.data()); - - if (dtype == -1) { - dtype = platform::ToNCCLDataType(lod_tensor.type()); - } + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int dtype = -1; + size_t numel = 0; + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + auto &lod_tensor = *lod_tensors[i]; + void *buffer = const_cast(lod_tensor.data()); - if (numel == 0) { - numel = static_cast(lod_tensor.numel()); - } + if (dtype == -1) { + dtype = platform::ToNCCLDataType(lod_tensor.type()); + } - int dev_id = boost::get(p).device; - auto &nccl_ctx = nccl_ctxs_->at(dev_id); - auto stream = nccl_ctx.stream(); - auto comm = nccl_ctx.comm_; - all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), - ncclSum, comm, stream)); - }); + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); } - this->RunAndRecordEvent([&] { + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), ncclSum, + comm, stream)); + }); + } + + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { platform::NCCLGroupGuard guard; for (auto &call : all_reduce_calls) { call(); } - }); + } + }); + + if (FLAGS_sync_nccl_allreduce) { + for (auto &p : places_) { + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + cudaStreamSynchronize(stream); + } + } + #else - PADDLE_THROW("Not compiled with CUDA"); + PADDLE_THROW("Not compiled with CUDA"); #endif - } else { // Special handle CPU only Operator's gradient. Like CRF - auto &trg = *this->local_scopes_[0] - ->FindVar(kLocalExecScopeName) - ->Get() - ->FindVar(out_var_handles[0]->name_) - ->GetMutable(); - - // Reduce All Tensor to trg in CPU - ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(lod_tensors[0]->type(), func); - - for (size_t i = 1; i < local_scopes_.size(); ++i) { - auto &scope = - *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); - auto &p = places_[i]; - auto *var = scope.FindVar(out_var_handles[i]->name_); - auto *dev_ctx = dev_ctxes_.at(p); - - RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { - auto &tensor_gpu = *var->GetMutable(); - auto &tensor_cpu = trg; - TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); - }); - } + } else { // Special handle CPU only Operator's gradient. Like CRF + auto &trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(out_var_handles[0]->name_) + ->GetMutable(); + + // Reduce All Tensor to trg in CPU + ReduceLoDTensor func(lod_tensors, &trg); + VisitDataType(lod_tensors[0]->type(), func); + + for (size_t i = 1; i < local_scopes_.size(); ++i) { + auto &scope = + *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); + auto &p = places_[i]; + auto *var = scope.FindVar(out_var_handles[i]->name_); + auto *dev_ctx = dev_ctxes_.at(p); + + RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { + auto &tensor_gpu = *var->GetMutable(); + auto &tensor_cpu = trg; + TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); + }); } } } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 389366a8a98c5753268718c49c62c2dffe99c32f..df0ff772c9d35c88ec5a6112525c56aa92d359b9 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/details/memory_reuse_types.h" -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" @@ -31,7 +31,11 @@ namespace framework { namespace details { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { - return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1); + // Should fix the allreduce op order if scheduling + // them in multiple threads or processes to avoid hang. + return (!strategy.enable_sequential_execution_ && + strategy.num_trainers_ > 1) || + strategy.enable_parallel_graph_; } class ParallelExecutorPassBuilder : public ir::PassBuilder { @@ -67,7 +71,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { context->endpoints_ = strategy_.trainers_endpoints_; context->trainer_id_ = strategy_.trainer_id_; PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0"); - if (strategy_.trainer_id_ > 0) { + if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) { PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) < strategy_.trainers_endpoints_.size(), "trainer_id_ < endpoints_ size"); @@ -82,12 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (strategy.memory_optimize_) { auto analysis_var_pass = AppendPass("analysis_var_pass"); } - // Convert graph to run on multi-devices. - auto multi_devices_pass = AppendPass("multi_devices_pass"); - multi_devices_pass->SetNotOwned("strategy", - &strategy_); - multi_devices_pass->Set("num_trainers", - new int(strategy_.num_trainers_)); + + AppendMultiDevPass(strategy); // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { @@ -113,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } + // Convert graph to run on multi-devices. + void AppendMultiDevPass(const BuildStrategy &strategy) { + ir::Pass *multi_devices_pass; + if (strategy_.is_distribution_) { + multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); + } else { + if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + multi_devices_pass = + AppendPass("allreduce_mode_multi_devices_pass").get(); + } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); + } else { + PADDLE_THROW("Unknown reduce strategy."); + } + } + multi_devices_pass->SetNotOwned("strategy", + &strategy_); + } + private: BuildStrategy strategy_; }; @@ -129,9 +148,14 @@ std::shared_ptr BuildStrategy::CreatePassesFromStrategy( return pass_builder_; } +bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { + return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0; +} + std::unique_ptr BuildStrategy::Apply( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else @@ -142,19 +166,23 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - if (pass->Type() == "multi_devices_pass") { - pass->Erase("places"); - pass->SetNotOwned>("places", &places); - pass->Erase("loss_var_name"); - pass->SetNotOwned("loss_var_name", &loss_var_name); - pass->Erase("local_scopes"); - pass->SetNotOwned>("local_scopes", + if (IsMultiDevPass(pass->Type())) { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLossVarName); + pass->SetNotOwned(kLossVarName, &loss_var_name); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, &local_scopes); + pass->Erase(kNRanks); + pass->Set(kNRanks, new size_t(nranks)); + #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { const std::vector *all_op_descs = new std::vector(main_program.Block(0).AllOps()); @@ -195,10 +223,13 @@ std::unique_ptr BuildStrategy::Apply( USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); -USE_PASS(multi_devices_pass); +USE_PASS(reduce_mode_multi_devices_pass); +USE_PASS(allreduce_mode_multi_devices_pass); +USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); USE_PASS(analysis_var_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); +USE_PASS(lock_free_optimize_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 11db184cb4efe349a340aceb4b7e1e3f4d4b24a5..15c2e01b6142571883c759efb1e26b609be9adb4 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,8 +74,6 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; - bool enable_data_balance_{false}; - bool memory_optimize_{false}; bool memory_early_delete_{false}; @@ -84,6 +82,10 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + // num_trainers is 1, so the current fields of build_strategy doesn't tell if + // it's distributed model. + bool is_distribution_{false}; int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; @@ -104,12 +106,15 @@ struct BuildStrategy { bool IsFinalized() const { return is_finalized_; } + bool IsMultiDevPass(const std::string &pass_name) const; + // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. std::unique_ptr Apply(const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; @@ -117,6 +122,13 @@ struct BuildStrategy { const bool use_cuda) const; #endif + // If set true, ParallelExecutor would build the main_program into multiple + // graphs, + // each of the graphs would run with one device. This approach can achieve + // better performance + // on some scenarios. + mutable bool enable_parallel_graph_ = false; + private: mutable bool is_finalized_ = false; mutable std::shared_ptr pass_builder_; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index abacb11e3b018308c20a67630e3ff34cca7d3387..03fbfd7f24a8a987db72f45be777acc7ece577a6 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() { } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c2b6c7643ff96661be09e5ac4870344..37b07e5736312b3050debe745f2d3c108469c5d6 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{100}; + size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 949510e03705a4a0900f1c7b8758a8f7308aa44b..872bc5d654cd66db821e56031d878815b653645c 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ClearFetchOp(graph_.get(), &fetch_ops); return fetches; } + void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, OpHandleBase *op, diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index c8ea18804630fea4ada98062256730dbf4c24860..a4bb1e26d933946b7ca36196d1c0e8a0a4ec54e2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -21,68 +21,78 @@ namespace paddle { namespace framework { namespace details { -bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { - std::unordered_map pending_ops; - std::unordered_set pending_vars; - std::unordered_set ready_vars; - std::unordered_set ready_ops; +class SSAGraghBuilderWithChecker : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } - auto insert_pending_var = [&](VarHandleBase *var) { - pending_vars.insert(var); - if (var->GeneratedOp() == nullptr) { - ready_vars.emplace(var); - } - }; + bool IsValidGraph(const ir::Graph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; - for (auto &var_map : graph->Get(kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - insert_pending_var(version_pair); + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->GeneratedOp() == nullptr) { + ready_vars.emplace(var); } - } - } + }; - for (auto &var : graph->Get(kGraphDepVars)) { - insert_pending_var(var); - } + for (auto &var_map : graph->Get(kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair); + } + } + } - for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { - if (op->Inputs().empty()) { - ready_ops.insert(op); - } else { - pending_ops.insert({op, op->NoDupInputSize()}); + for (auto &var : graph->Get(kGraphDepVars)) { + insert_pending_var(var); } - } - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - for (auto out : op->Outputs()) { - ready_vars.emplace(out); + for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { + if (op->Inputs().empty()) { + ready_ops.insert(op); + } else { + pending_ops.insert({op, op->NoDupInputSize()}); } } - set.clear(); - }; - while (!pending_vars.empty()) { - run_all_ops(ready_ops); + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; - if (ready_vars.empty()) { - return false; - } + while (!pending_vars.empty()) { + run_all_ops(ready_ops); - for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = --pending_ops[op]; - if (deps == 0) { - ready_ops.insert(op); + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } } } + ready_vars.clear(); } - ready_vars.clear(); + return true; } - return true; -} +}; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7e320a08942e4a9a27e6b5c892a993b3a90c43a4..d91993bd4f8c04539cd189a4145350498911c513 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -42,6 +42,12 @@ namespace { typedef std::vector GraphOps; const char kGraphOps[] = "ops"; +bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) { + return boost::get( + node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(role); +} + void PolishGraphToSupportDataHazards(ir::Graph *graph) { for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { @@ -128,15 +134,8 @@ void AddOutputToLeafOps(ir::Graph *graph) { } } // namespace -static const char kLossVarName[] = "loss_var_name"; -static const char kPlaces[] = "places"; -static const char kLocalScopes[] = "local_scopes"; -static const char kStrategy[] = "strategy"; -static const char kNumTrainers[] = "num_trainers"; - -void MultiDevSSAGraphBuilder::Init() const { +void MultiDevSSAGraphBuilderBase::Init() const { all_vars_.clear(); - balance_vars_.clear(); loss_var_name_ = Get(kLossVarName); places_ = Get>(kPlaces); @@ -145,320 +144,82 @@ void MultiDevSSAGraphBuilder::Init() const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif - - balance_vars_.resize(places_.size(), 0); - if (strategy_.enable_data_balance_ && places_.size() == 1) { - LOG(WARNING) << "It is no need to enable data balance when there is only " - "one place. enable_data_balance is set to False."; - strategy_.enable_data_balance_ = false; - } -} - -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { - auto p = places_[place_id]; - auto *op_handle = result->Get(kGraphOps).back(); - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - - for (ir::Node *input : node->inputs) { - VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); - op_handle->AddInput(var); - } - - for (ir::Node *output : node->outputs) { - ir::Node *new_node = nullptr; - if (output->Var()) { - new_node = result->CreateVarNode(output->Var()); - } else { - new_node = - result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); - } - CreateOpOutput(result, op_handle, new_node, p, place_id); - } -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector &nodes) const { - std::vector send_vars; - // since parameters are all in block 0, - // it's enough to only scan send ops in block 0 - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find send op, - // instead of the the hard code string - if (op->Type() == "send") { - auto op_vars = op->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return send_vars; -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector &nodes) const { - std::vector recv_vars; - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find recv op, - // instead of the hard code string - if (op->Type() == "recv") { - auto op_vars = op->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return recv_vars; -} - -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -// Topology sort the graph nodes from inputs to outputs. -// Since SSAGraphBuilder depends on forward/backward nodes to assign devices -// to parameter/gradients before optimizer ops, topo sort is insufficient. ( -// some optimizer ops might not depend on any nodes), we manually move all -// optimizer nodes after last backward nodes. -// However, the assumption by SSAGraphBuilder should be relaxed in the future. -std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { - std::vector ret = ir::TopologySortOperations(graph); - size_t last_backward = 0; - for (size_t i = 0; i < ret.size(); ++i) { - if (boost::get( - ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = i; - } - } - - std::vector optimize_ops; - std::vector sorted_ret; - for (size_t i = 0; i < ret.size(); ++i) { - if (i < last_backward) { - if (static_cast(boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kOptimize))) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); - } - } else if (i == last_backward) { - sorted_ret.push_back(ret[i]); - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (ir::Node *n : sorted_ret) { - for (ir::Node *in : n->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), n->Name()); - } - } - } - sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), - optimize_ops.end()); - } else { - sorted_ret.push_back(ret[i]); - } - } - return sorted_ret; } -std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( +std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( std::unique_ptr graph) const { Init(); - // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + std::vector sorted_ops = SortOperations(*graph); + auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - int num_trainers = Get(kNumTrainers); - for (auto &node : nodes) { if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); } } - std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 result.Set(kGraphVars, new GraphVars(places_.size())); result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - // find send/recv vars so that we can place the distributed training - // related op in the place 0 - auto send_vars = FindDistTrainSendVars(sorted_ops); - auto recv_vars = FindDistTrainRecvVars(sorted_ops); - - std::vector> bcast_var_name_set; - bcast_var_name_set.resize(places_.size()); - - size_t cur_device_id = 0; bool is_forwarding = true; - bool is_dist_train = false; - - std::unordered_map sharded_var_device; + bool insert_collection_ops = NeedCollectiveOps(); for (ir::Node *node : sorted_ops) { - if (boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kRPC)) { - int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); - PADDLE_ENFORCE(op_dev_id != -1, - "Can not schedule the RPC operator to the right place."); - if (node->Op()->Type() == "recv") { - auto recv_vars_attr = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] - if (recv_vars_attr[0].find(".block") == std::string::npos) { - bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); - } - } - is_dist_train = true; - } else if (boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kDist)) { - int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); - if (node->Op()->Type() == "concat") { - auto origin_param_name = node->Op()->OutputArgumentNames()[0]; - bcast_var_name_set[op_dev_id].emplace(origin_param_name); - } - } else if (IsScaleLossOp(node)) { - // user can customize loss@grad if not use_default_grad_scale_ - if (strategy_.gradient_scale_ != - BuildStrategy::GradientScaleStrategy::kCustomized) { - // TODO(paddle-dev): Why is there no input for this op_handle? - auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], - out_dtype); - } - // This assumes the backward generating code will ensure IsScaleLossOp - // is true only for the op that scale the final scalar loss. - // It also assumes backward op will always follow the forward op in - // the block. - is_forwarding = false; + if (DealWithSpecialOp(&result, node)) { + continue; } else { - int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); - if (op_dev_id != -1) { // This op only runs on one specific device. - CreateComputationalOp(&result, node, op_dev_id); - for (ir::Node *n : node->outputs) { - sharded_var_device.emplace(n->Name(), op_dev_id); - } + // This op runs on all devices + if (IsScaleLossOp(node)) { + // user can customize loss@grad if not use_default_grad_scale_ + InsertScaleLossGradOp(&result, node); + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. + is_forwarding = false; } else { - // This op runs on all devices, and its output may have parameter's - // gradients. - // TODO(paddle-dev): Why is so special about "read" op? - if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { - node->Op()->SetAttr("throw_eof_exp", false); - CreateComputationalOps(&result, node, places_.size()); - const auto &data_var_names = node->Op()->Output("Out"); - InsertDataBalanceOp(&result, data_var_names); - } else { - CreateComputationalOps(&result, node, places_.size()); - } + CreateComputationalOps(&result, node, places_.size()); + } + + // Insert collection ops + if (!is_forwarding && insert_collection_ops) { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; - if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } - } - } catch (boost::bad_get e) { - } + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + InsertCollectiveOp(&result, p_name, g_name); } + } catch (boost::bad_get e) { } } } } - bool use_gpu = false; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - use_gpu = nccl_ctxs_ != nullptr; -#endif - // Insert broadcast operators principle: - // 1. Broadcast optimized parameters in Reduce strategy; - // 2. No need broadcast optimized parameters in AllReduce strategy because of - // the optimization sub-graph would be run on every GPU; - // 3. Allways broadcast received parameters in Distribute Training. - if ((use_gpu && - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || - is_dist_train) { - if (strategy_.fuse_broadcast_op_) { - CreateFusedBroadcastOp(&result, bcast_var_name_set); - } else { - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); - } - } - } - } + InsertPostprocessOps(&result); + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. - */ + */ PolishGraphToSupportDataHazards(&result); /* @@ -469,15 +230,77 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( + ir::Graph *result, const ir::Node *node) const { + // user can customize loss@grad if not use_default_grad_scale_ + size_t loss_scale = 0; + switch (this->strategy_.gradient_scale_) { + case BuildStrategy::GradientScaleStrategy::kOne: + loss_scale = 1; + break; + case BuildStrategy::GradientScaleStrategy::kCoeffNumDevice: + loss_scale = Get(kNRanks); + break; + case BuildStrategy::GradientScaleStrategy::kCustomized: + loss_scale = 0; + break; + default: + LOG(FATAL) << "Unknown gradient scale strategy."; + break; + } + + if (loss_scale) { + // TODO(paddle-dev): Why is there no input for this op_handle? + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + auto out_dtype = this->all_vars_.at(loss_grad_name)->GetDataType(); + this->CreateScaleLossGradOp(result, loss_grad_name, node->outputs[0], + loss_scale, out_dtype); + } +} + +std::vector MultiDevSSAGraphBuilderBase::SortOperations( + const ir::Graph &graph) const { + return ir::TopologySortOperations(graph); +} + +bool MultiDevSSAGraphBuilderBase::UseGPU() const { + bool use_gpu = false; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + use_gpu = nccl_ctxs_ != nullptr; +#endif + return use_gpu; +} + +bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { + return Get(kNRanks) > 1; +} + +void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); } - return false; } -void MultiDevSSAGraphBuilder::SetCommunicationContext( +void MultiDevSSAGraphBuilderBase::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { @@ -490,9 +313,9 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, - const std::string &p_name, - size_t src_dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, + const std::string &p_name, + size_t src_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), @@ -520,7 +343,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, } } -void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( +void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -558,17 +381,17 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, - ir::Node *node, - int dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, + ir::Node *node, + int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, - const std::string &og) const { +void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( + ir::Graph *result, const std::string &og) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -596,77 +419,15 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, } } -void MultiDevSSAGraphBuilder::InsertDataBalanceOp( - ir::Graph *result, const std::vector &datas) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_, nccl_ctxs_)); -#else - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_)); -#endif - auto *op_handle = result->Get(kGraphOps).back(); - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - for (const std::string &d_name : datas) { - auto &vars = result->Get(kGraphVars)[i][d_name]; - PADDLE_ENFORCE(!vars.empty()); - op_handle->AddInput(vars.back()); - auto var = new VarHandle( - result->CreateEmptyNode(d_name, ir::Node::Type::kVariable), - vars.size(), i, d_name, p); - vars.emplace_back(var); - op_handle->AddOutput(var); - } - } -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, - const std::unordered_map &sharded_var_device) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - int op_role = boost::get( - node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - if (op_role != static_cast(framework::OpRole::kOptimize)) { - return -1; - } - auto param_grad = boost::get>( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device); - PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", - node->Op()->Type(), param_grad[0], param_grad[1]); - return dev_id; -} - -int MultiDevSSAGraphBuilder::GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, - const std::unordered_map &sharded_var_device) const { - auto got = sharded_var_device.find(varname); - if (got == sharded_var_device.end()) { - auto pos = varname.find(framework::kNewGradSuffix); - if (pos != std::string::npos) { - got = sharded_var_device.find(varname.substr(0, pos)); - } - } - return got == sharded_var_device.end() ? -1 : got->second; -} - -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( +void MultiDevSSAGraphBuilderBase::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, proto::VarType::Type dtype) const { + ir::Node *out_var_node, size_t loss_scale, + proto::VarType::Type dtype) const { for (size_t i = 0; i < places_.size(); ++i) { - // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype); + loss_scale, local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -680,9 +441,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, - ir::Node *node, - size_t num_places) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOps( + ir::Graph *result, ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; @@ -692,9 +452,9 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, - const std::string &og, - int dst_dev_id) const { +VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, + const std::string &og, + int dst_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), @@ -723,53 +483,273 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -int MultiDevSSAGraphBuilder::CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { - int op_dev_id = -1; - std::vector input_var_names; - std::vector output_var_names; - for (ir::Node *input : node->inputs) { - input_var_names.push_back(input->Name()); +bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { + return boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)) && + !loss_var_name_.empty(); // If loss_var is empty. This is test mode +} + +bool MultiDevSSAGraphBuilderBase::IsSparseGradient( + const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; } - for (ir::Node *output : node->outputs) { - output_var_names.push_back(output->Name()); + return false; +} + +void AllReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); } +} - if (node->Op()->Type() == "split_byref" || - node->Op()->Type() == "split_selected_rows" || - node->Op()->Type() == "split_ids") { - // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - op_dev_id = GetAppropriateDeviceID(input_var_names); - for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); +int BalanceVarSSAGraphBuilder::GetVarDeviceID( + const std::string &varname) const { + auto got = sharded_var_device_.find(varname); + if (got == sharded_var_device_.end()) { + auto pos = varname.find(framework::kNewGradSuffix); + if (pos != std::string::npos) { + got = sharded_var_device_.find(varname.substr(0, pos)); + } + } + return got == sharded_var_device_.end() ? -1 : got->second; +} + +int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", + node->Op()->Type(), param_grad[0], param_grad[1]); + return dev_id; +} + +size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + +void BalanceVarSSAGraphBuilder::ResetState() const { + balance_vars_.clear(); + sharded_var_device_.clear(); + + balance_vars_.resize(places_.size(), 0); +} + +void ReduceSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void ReduceSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +void ReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + bcast_var_name_set_[cur_device_id].emplace(p_name); +} + +bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = BalanceVarSSAGraphBuilder::GetOpDeviceID(node); + if (op_dev_id != -1) { + // This op only runs on one specific device. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + return true; + } + return false; +} + +void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (UseGPU()) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } } } - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + } +} + +int ReduceSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, + std::unordered_map> *delay_ops) const { + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +std::vector ReduceSSAGraphBuilder::SortOperations( + const ir::Graph &graph) const { + std::vector sorted_ops = ir::TopologySortOperations(graph); + return SortForReduceMode(sorted_ops); +} + +std::vector ReduceSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + ResetState(); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device_.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); } - } else if (node->Op()->Type() == "concat") { - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. } - } else { - LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); - PADDLE_THROW( - "the distribute training related op should be in [split_byref, " - "concat]."); } - PADDLE_ENFORCE(op_dev_id != -1, - "can not find right place for distributed op: %s", - node->Op()->Type()); + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); - CreateComputationalOp(result, node, op_dev_id); - return op_dev_id; + ResetState(); + return sorted_ops; +} + +void DistSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void DistSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + bool insert_op = false; + if (OpHaveRole(*node, OpRole::kRPC)) { + int op_dev_id = CreateRPCOp(result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]); + } + } + insert_op = true; + need_broadcast_var_ = true; + } else if (OpHaveRole(*node, OpRole::kDist)) { + int op_dev_id = CreateDistTrainOp(result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set_[op_dev_id].emplace(origin_param_name); + } + insert_op = true; + } else { + int op_dev_id = GetOpDeviceID(node); + if (op_dev_id != -1) { // This op only runs on one specific device. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + insert_op = true; + } + } + return insert_op; } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { @@ -788,14 +768,11 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -int MultiDevSSAGraphBuilder::CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { +int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -813,9 +790,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( VLOG(10) << "send grad " << input_var_names[0] << " origin " << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } - sharded_var_device->emplace(send_param_grad[1], op_dev_id); + sharded_var_device_.emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; @@ -825,8 +802,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = - GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -834,7 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( op_dev_id = GetAppropriateDeviceID(output_var_names); } for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } } else { // send_barrier, fetch_barrier will run on place 0; @@ -861,8 +837,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = - GetVarDeviceID(*result, output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name()); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -879,21 +854,124 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } -bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { - return boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss)) && - !loss_var_name_.empty(); // If loss_var is empty. This is test mode +int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = -1; + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + if (node->Op()->Type() == "split_byref" || + node->Op()->Type() == "split_selected_rows" || + node->Op()->Type() == "split_ids") { + // TODO(paddle-dev): getting the first var is not safe. + op_dev_id = GetVarDeviceID(input_var_names[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else if (node->Op()->Type() == "concat") { + op_dev_id = GetVarDeviceID(input_var_names[0]); + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else { + LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); + PADDLE_THROW( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", + node->Op()->Type()); + + CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; +} + +void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, + const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = 0; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy."; + break; + } } + +void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (need_broadcast_var_ || + (UseGPU() && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } + } + } + } +} + +std::unordered_set &MultiDevSSAGraphBuilder() { + static std::unordered_set regs; + return regs; +} + +static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { + MultiDevSSAGraphBuilder().insert(builder_mode); + return 0; +} + } // namespace details } // namespace framework } // namespace paddle -REGISTER_PASS(multi_devices_pass, - paddle::framework::details::MultiDevSSAGraphBuilder) - .RequirePassAttr(paddle::framework::details::kLossVarName) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes) - .RequirePassAttr(paddle::framework::details::kStrategy) - .RequirePassAttr(paddle::framework::details::kNumTrainers); +#define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_ssa_graph_builder_##pass_name, \ + "REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \ + int _reg_ssa_graph_builder_entry_##pass_name = \ + paddle::framework::details::MultiDevSSAGraphBuilderRegister(#pass_name); \ + REGISTER_PASS(pass_name, pass_class) \ + .RequirePassAttr(paddle::framework::details::kLossVarName) \ + .RequirePassAttr(paddle::framework::details::kPlaces) \ + .RequirePassAttr(paddle::framework::details::kLocalScopes) \ + .RequirePassAttr(paddle::framework::details::kStrategy) \ + .RequirePassAttr(paddle::framework::details::kNRanks) + +REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, + paddle::framework::details::ReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS( + allreduce_mode_multi_devices_pass, + paddle::framework::details::AllReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, + paddle::framework::details::DistSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 5736102ddc13418446013307cf8204b677f960dc..6d4386538ea7d0cc318647c92282af9d598fa699 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include #include @@ -30,84 +31,154 @@ namespace framework { class Scope; namespace details { -class MultiDevSSAGraphBuilder : public ir::Pass { +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kPlaces[] = "places"; +constexpr char kLocalScopes[] = "local_scopes"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + +class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - private: - void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, - size_t device_id) const; - void Init() const; + virtual void Init() const; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - mutable platform::NCCLContextMap *nccl_ctxs_; -#endif + virtual std::vector SortOperations(const ir::Graph &graph) const; - int GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, - const std::unordered_map &sharded_var_device) const; + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const = 0; - bool IsScaleLossOp(ir::Node *node) const; + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; - int CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; - int CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; + virtual void InsertPostprocessOps(ir::Graph *result) const = 0; - std::vector FindDistTrainSendVars( - const std::vector &nodes) const; + bool UseGPU() const; - std::vector FindDistTrainRecvVars( - const std::vector &nodes) const; + bool NeedCollectiveOps() const; + + bool IsScaleLossOp(ir::Node *node) const; void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, + ir::Node *out_var_node, size_t loss_scale, proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; + void CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const; - int GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, - const std::unordered_map &sharded_var_device) const; - - void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; + bool IsSparseGradient(const std::string &og) const; - void InsertDataBalanceOp(ir::Graph *result, - const std::vector &datas) const; + void CreateAllReduceOp(ir::Graph *result, const std::string &og) const; void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const; + void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const; + void CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const; - bool IsSparseGradient(const std::string &og) const; - - size_t GetAppropriateDeviceID( - const std::vector &var_names) const; - void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + mutable platform::NCCLContextMap *nccl_ctxs_; +#endif + mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; +}; + +class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + return false; + } + + virtual void InsertPostprocessOps(ir::Graph *result) const {} +}; + +class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + int GetVarDeviceID(const std::string &varname) const; + + int GetOpDeviceID(ir::Node *node) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; + + virtual void ResetState() const; + + mutable std::unordered_map sharded_var_device_; mutable std::vector balance_vars_; }; + +class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual std::vector SortOperations(const ir::Graph &graph) const; + + virtual void ResetState() const; + + int GetOpDeviceID(ir::Node *node, + std::unordered_map> + *delay_ops) const; + + std::vector SortForReduceMode( + const std::vector &topo_ops) const; + + mutable std::vector> bcast_var_name_set_; +}; + +class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual void ResetState() const; + + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + + mutable std::vector> bcast_var_name_set_; + mutable bool need_broadcast_var_{false}; +}; + +std::unordered_set &MultiDevSSAGraphBuilder(); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..128aaa33a2c60e62fdca13768cdc0a815167f3ef --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), + places_(std::move(places)), + graphs_(std::move(graphs)) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 1UL + : strategy_.num_threads_ / places_.size(); + VLOG(1) << "set num_threads: " << strategy_.num_threads_ + << " to run the operators of the graph on each device."; + for (size_t i = 0; i < places.size(); ++i) { + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + } +} + +FeedFetchList ParallelSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + + std::vector fetch_data; + FeedFetchList ret; + + fetch_data.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); + }; + + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + fetch_data.emplace_back(std::move(call())); + } + } + + if (pool_) { + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); + } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); + } + return ret; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..c00c5bc2d1b4b78593f99c819b5a3d642150e773 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelSSAGraphExecutor : public SSAGraphExecutor { + public: + ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs); + ~ParallelSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; + std::vector places_; + std::vector> graphs_; + + std::vector> executors_; + ExceptionHolder exception_holder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 57f6fc66c57e2a53d9cf30d7761626a50bc379ea..91e4f9adb418978c30f512abe6924c0ace182124 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -56,7 +56,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } } std::vector fetch_data; - std::exception_ptr eptr; + std::exception_ptr eptr = nullptr; try { fetch_data = underlying_executor_->Run(fetch_tensors); } catch (...) { @@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); - drop_scope_counter_ += 1; + ++drop_scope_counter_; - if (!fetch_tensors.empty() || - drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - drop_scope_counter_ = 0; - // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); + bool stream_end = false; + if (!fetch_tensors.empty()) { + WaitComputationalStreams(); + stream_end = true; + } + + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + if (!stream_end) { + WaitComputationalStreams(); } + for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } + + drop_scope_counter_ = 0; } if (eptr) { std::rethrow_exception(eptr); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50b51d2b630aba06a5907dd721754d1f..0f6340213daee98a75401f9db0e628f7b4fd79fc 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; + private: + inline void WaitComputationalStreams() { + // Wait All computational streams + for (auto p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + } + private: size_t drop_scope_counter_{0}; diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 3dfd14419d94379a0bf79f55d7a139acd77cbd7e..134f759081a0778194c20785e215420d6e2bb622 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) { } else if (var->IsType()) { (*func)(var->GetMutable()); } else { - PADDLE_THROW("Not supported type %s", var->Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); } } @@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) { } else if (var.IsType()) { (*func)(var.Get()); } else { - PADDLE_THROW("Not supported type %s", var.Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 73f92fa389fa3a66a14ae60b8dbfbcae80485658..88aee8379d835ce88b6b348aca99eb4a35bbeb5c 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -16,332 +16,184 @@ #include #include #include +#include #include +#include "paddle/fluid/framework/array.h" #include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace framework { // Statically sized, statically indexed dimension -template -struct Dim { - static constexpr int dimensions = i; +template +class Dim : public Array { + public: + static_assert(D >= 0, "D must be not less than 0"); - template - HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { - static_assert(sizeof...(_tail) == i - 1, - "Dim initialized with the wrong number of parameters"); - } + static constexpr int kRank = D; + using BaseClass = Array; - HOSTDEVICE - Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} + inline Dim(int64_t head, const Dim& tail) { + (*this)[0] = head; + new (this->GetMutable() + 1) Dim(tail); + } - HOSTDEVICE - Dim() : head(0), tail() {} + template + HOSTDEVICE explicit Dim(int64_t head, Args... args) + : BaseClass(head, args...) {} /** Construct a Dim from a linear index and size. Uses Fortran order * indexing. */ - HOSTDEVICE - Dim(int64_t idx, const Dim& size) - : head(idx % size.head), tail(idx / size.head, size.tail) {} + HOSTDEVICE Dim(int64_t idx, const Dim& size); /** Construct a Dim with each dimension set to the given index */ - HOSTDEVICE - Dim(int64_t idx) : head(idx), tail(idx) {} + HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } - HOSTDEVICE - bool operator==(const Dim& o) const { - return (head == o.head) && (tail == o.tail); - } - - HOSTDEVICE - bool operator!=(const Dim& o) const { return !(*this == o); } - - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; + HOSTDEVICE Dim() = default; HOST std::string to_string() const; - - int64_t head; - Dim tail; }; -// Base case specialization -template <> -struct Dim<0> { - static constexpr int dimensions = 0; - - HOSTDEVICE - Dim(int64_t _head) {} - - HOSTDEVICE - Dim() {} - - HOSTDEVICE - Dim(int idx, const Dim<0>& size) { -#ifndef __CUDA_ARCH__ - if (idx > 0) { - throw std::invalid_argument("Index out of range."); - } -#else - PADDLE_ASSERT(idx == 0); -#endif - } - - HOSTDEVICE - bool operator==(const Dim<0>& o) const { return true; } - - HOSTDEVICE - bool operator!=(const Dim<0>& o) const { return false; } - - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; -}; - -namespace { - -// Helper for accessing Dim classes -template -struct DimGetter { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return DimGetter::impl(d.tail); - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return DimGetter::impl(d.tail); +namespace detail { +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) { + out[kStart] = (*idx) % in[kStart]; + (*idx) /= in[kStart]; + FortranOrderIndexingConstructorFunctor::Run(in, idx, + out); } }; -// Eureka! We found the element! -template <> -struct DimGetter<0> { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return d.head; - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return d.head; - } +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) {} }; +} // namespace detail template -HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif -} - -template -HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif -} - -} // namespace -// Static access to constant Dim -template -HOSTDEVICE int64_t get(const Dim& d) { - return DimGetter::impl(d); +HOSTDEVICE Dim::Dim(int64_t idx, const Dim& size) { + detail::FortranOrderIndexingConstructorFunctor<0, D, D == 0>::Run( + size.Get(), &idx, this->GetMutable()); } -// Static access to mutable Dim -template -HOSTDEVICE int64_t& get(Dim& d) { - return DimGetter::impl(d); +template +HOSTDEVICE inline int64_t get(const Dim& dim) { + return dim[idx]; } -// Dynamic access to constant Dim -template -HOSTDEVICE int64_t Dim::operator[](int i) const { - return indexer(*this, i); +template +HOSTDEVICE inline int64_t& get(Dim& dim) { // NOLINT + return dim[idx]; } -// Dynamic access to mutable Dim -template -HOSTDEVICE int64_t& Dim::operator[](int i) { - return indexer(*this, i); -} - -// Dynamic access to constant Dim -inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const { - return indexer(*this, i); -} - -// Dynamic access to mutable Dim -inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) { - return indexer(*this, i); -} - -// Dynamic access to constant Dim -// without std::enable_if will try to instantiate this on get<0>(d) -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, - int i) { - return d[i]; +template +HOSTDEVICE inline int64_t get(const Dim& dim, int idx) { + return dim[idx]; } -// Dynamic access to mutable Dim -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, - int i) { - return d[i]; +template +HOSTDEVICE inline int64_t& get(Dim& dim, int idx) { // NOLINT + return dim[idx]; } // Dot product of two dims -template -HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { - return a.head * b.head + linearize(a.tail, b.tail); -} - -// Base case dot product of two Dims -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) { - return 0; +template +HOSTDEVICE inline int64_t linearize(const Dim& a, const Dim& b) { + return UnrollProduct::Run(a.Get(), b.Get()); } // Product of a Dim -template -HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { - return prod * a.head * product(a.tail); -} - -// Base case product of a Dim -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) { - return prod; +template +HOSTDEVICE inline int64_t product(const Dim& a) { + return UnrollProduct::Run(a.Get()); } // Is 0 <= idx_i < size_i for all i? -template -HOSTDEVICE bool contained(const Dim& idx, const Dim& size) { - return ((0 <= idx.head) && (idx.head < size.head) && - contained(idx.tail, size.tail)); -} +namespace detail { +template +struct ContainedFunctor { + HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) { + return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) && + ContainedFunctor::Run(idx, + size); + } +}; -// Base case of is 0 <= idx_i < size_i ? -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) { - return true; +template +struct ContainedFunctor { + HOSTDEVICE static constexpr inline bool Run(const int64_t* idx, + const int64_t* size) { + return true; + } +}; +} // namespace detail + +template +HOSTDEVICE inline bool contained(const Dim& idx, const Dim& size) { + return detail::ContainedFunctor<0, D, D == 0>::Run(idx.Get(), size.Get()); } /** * \brief Compute exclusive prefix-multiply of a Dim. */ -template -HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) { - return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); -} +namespace detail { +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) { + kStart == 0 ? out[kStart] = 1 : out[kStart] = + out[kStart - 1] * in[kStart - 1]; + detail::ExPrefixMulFunctor::Run(in, + out); + } +}; + +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {} +}; +} // namespace detail -///\cond HIDDEN -// Base case of ex_prefix_mul -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) { - return Dim<0>(); +template +HOSTDEVICE inline Dim ex_prefix_mul(const Dim& src) { + Dim ret; + detail::ExPrefixMulFunctor<0, D, D == 0>::Run(src.Get(), ret.GetMutable()); + return ret; } -///\endcond /** * Add two dimensions together */ -template -HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) { - return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); -} - -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); +template +HOSTDEVICE inline Dim dim_plus(const Dim& a, const Dim& b) { + Dim ret; + UnrollAdd::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } -template -HOSTDEVICE Dim operator+(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE inline Dim operator+(const Dim& lhs, const Dim& rhs) { return dim_plus(lhs, rhs); } /** * Multiply two dimensions together */ -template -HOSTDEVICE Dim dim_mult(const Dim& a, const Dim& b) { - return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); -} - -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); +template +HOSTDEVICE inline Dim dim_mult(const Dim& a, const Dim& b) { + Dim ret; + UnrollMul::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } -template -HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { return dim_mult(lhs, rhs); } @@ -354,23 +206,32 @@ HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { * \return Dim object the same size as \p size with normalized strides * */ +namespace detail { +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) { + ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]); + NormalizeStridesFunctor::Run( + size, stride, ret); + } +}; -template -HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { - int norm_stride = size.head == 1 ? 0 : stride.head; - return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); -} - -///\cond HIDDEN +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) {} +}; +} // namespace detail -template <> -HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, - const Dim<0>& stride) { - return Dim<0>(); +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + Dim ret; + detail::NormalizeStridesFunctor<0, D, D == 0>::Run(size.Get(), stride.Get(), + ret.GetMutable()); + return ret; } -///\endcond - /** * Helper function to create a Dim * @@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, */ template -HOSTDEVICE Dim make_dim(Args... idxes) { +HOSTDEVICE inline Dim make_dim(Args... idxes) { return Dim(idxes...); } // Allows us to output a Dim -// XXX For some reason, overloading fails to resolve this correctly -template -typename std::enable_if<(i > 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head << ", " << d.tail; - return os; -} - -// Base case that allows us to output a Dim -// XXX I wish this could be an overload instead of a template -template -typename std::enable_if<(i == 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head; +template +inline std::ostream& operator<<(std::ostream& os, const Dim& d) { + os << d[0]; + for (int i = 1; i < D; ++i) { + os << ", " << d[i]; + } return os; } @@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { return os; } -template -HOST std::string Dim::to_string() const { +template +HOST std::string Dim::to_string() const { std::stringstream stream; - stream << *this; - return stream.str(); } template -HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { +HOSTDEVICE Dim linear_to_dimension(int linear_index, const Dim& extents) { Dim result; for (int i = 0; i < D - 1; ++i) { @@ -428,5 +279,10 @@ HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { return result; } +template +inline void static_dim_assign(const T1* in, T2* out) { + UnrollAssign::Run(in, out); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index eaef093ed3b6e4a1a347adc7b221b800c7f539af..39652706c43fb51da99170b361b3e1a6e04c6fc9 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CPUPlace &place) const { - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; return ctx; @@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLGPU; ctx.device_id = place.device; return ctx; @@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPUPinned; ctx.device_id = 0; return ctx; diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index 0c52bce1ef6af9b92bcb9f87c6781de878ed5898..e48b0d5c88fecf797a61283b004735fdcbabb329 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -38,7 +38,7 @@ class DLPackTensor { // The shape in DLTensor is defined as int64_t* // Add this member to make TVMTensor init without heap allocation - ShapeType shape_[9]; + ShapeType shape_[DDim::kMaxRank]; }; } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index da9556c6c1f3468208db02f2958ad6ad137c6566..c93bbe7ceecce9193acfae0b4e03c06212edd6d6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -119,7 +119,7 @@ static void DeleteUnusedTensors( } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } } diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 2eb9e564f87807e88def536ee875ebe0d1e83cd6..4972bc7ec3a90f8cebea19bcaf320813f7e50e39 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" namespace paddle { namespace framework { @@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() { return; #else static unsigned concurrency_cap = std::thread::hardware_concurrency(); + LOG(WARNING) << "concurrency capacity " << concurrency_cap; int thread_id = this->thread_id_; if (static_cast(thread_id) < concurrency_cap) { @@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) { VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); } +void ExecutorThreadWorker::TrainFilesWithTimer() { + platform::SetNumThreads(1); + SetDevice(); + thread_reader_->Start(); + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + op_name.push_back(op->Type()); + } + op_total_time.resize(ops_.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + int cur_batch; + int batch_cnt = 0; + timeline.Start(); + while ((cur_batch = thread_reader_->Next()) > 0) { + timeline.Pause(); + read_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + for (size_t i = 0; i < ops_.size(); ++i) { + timeline.Start(); + ops_[i]->Run(*thread_scope_, place_); + timeline.Pause(); + op_total_time[i] += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + ++batch_cnt; + thread_scope_->DropKids(); + if (thread_id_ == 0) { + if (batch_cnt > 0 && batch_cnt % 1000 == 0) { + for (size_t i = 0; i < ops_.size(); ++i) { + fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, + op_name[i].c_str(), op_total_time[i] / batch_cnt); + } + fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); + int fetch_var_num = fetch_var_names_.size(); + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } + } + } + timeline.Start(); + } +} + void ExecutorThreadWorker::TrainFiles() { platform::SetNumThreads(1); @@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr( std::shared_ptr pslib_ptr) { _pslib_ptr = pslib_ptr; } + void AsyncExecutorThreadWorker::SetPullDenseThread( std::shared_ptr dpt) { _pull_dense_thread = dpt; } + void AsyncExecutorThreadWorker::TrainOneNetwork() { PrepareParams(); diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 30b81ad88035eacc7a8efbe6d20f03d362122003..524922b0322e538d46f93011fbca3223b02d8849 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -155,6 +155,8 @@ class ExecutorThreadWorker { void SetDataFeed(const std::shared_ptr& datafeed); // A multi-thread training function virtual void TrainFiles(); + // with timer log + virtual void TrainFilesWithTimer(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); #ifdef PADDLE_WITH_PSLIB diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b7f7e2ee8ef590c0d0d8307de4400a8ce8ad4e7d..6e6db3d3efbc9fbb17e7ee45402dd4cb7f4f7a34 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) +pass_library(lock_free_optimize_pass base) pass_library(fc_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) @@ -45,6 +46,7 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) +pass_library(conv_affine_channel_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7bfb8cf1ee09e78051e2f140c9a7ab4c40db60c --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; +} + +std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+affinechannel fuse"; + return; + } + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Get affine_channel bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_ac_count); + return graph; +} + +std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); + + GraphSafeRemoveNodes(graph.get(), + {ac_scale, ac_bias, affine_channel, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, ac_out); + + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + AddStatis(found_conv_ac_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); +REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, + paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..ad966e11e6222a4ed4c730089c454b0d1c7bd0b3 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class ConvAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_affine_channel_fuse"}; +}; + +class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 23f343f631628b432d91d4504019895ed4bac4a5..c6121777e8d2c32193b5c170bb0fa3f0337c9bc3 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc( const std::string& output) { auto proto = base_desc; framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); desc.SetInput("Bias", {bias}); desc.SetInput("ResidualData", {bias1}); desc.SetAttr("activation", activation); desc.SetOutput("Output", {output}); desc.SetAttr("is_test", true); - + desc.SetAttr("use_cudnn", false); + desc.Flush(); return *desc.Proto(); } std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr graph) const { - const std::string pattern_name = "conv_elementwise_add_act_fuse"; + const std::string pattern_name = "conv_elementwise_add2_act_fuse"; FusePassBase::Init(pattern_name, graph.get()); GraphPatternDetector gpd; @@ -76,22 +78,23 @@ std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( framework::OpDesc new_op_desc(new_op_proto, nullptr); // Create a new node for the fused op. - graph->CreateOpNode(&new_op_desc); + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. PADDLE_ENFORCE(subgraph.count(x)); auto* conv_in_node = subgraph.at(x); - IR_NODE_LINK_TO(conv_in_node, conv_op); // Input - IR_NODE_LINK_TO(conv_filter, conv_op); // Filter - IR_NODE_LINK_TO(conv_op, conv_out); // Output - IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias - IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), - {conv_op, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out}); + GraphSafeRemoveNodes( + graph.get(), + {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); }; gpd(graph.get(), handler); return graph; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8670dcfed7e40473e06cd12cecc1157dd4f54aa0..3eb5bdba3b7275f45cdfc6ad47f75e7a423541d0 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -23,66 +23,8 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -namespace { - -void CheckProgram(const ProgramDesc &program) { -#define _INT(role) static_cast(role) - - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. Don't add new role because " - "you don't know what you are doing."; - } - } - -#undef _INT -} -} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { - CheckProgram(program_); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 13d752e5167c039ec8d9e4300b190a726bb02a63..c513fe2dd8f5733c87802f6fa9980ad885dfd865 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } -std::unordered_set conv_act_set({"identity", "sigmoid", "relu", - "relu6", "relux", "tanh", - "band_pass"}); +std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); @@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { ->AsInput(); auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) ->assert_is_op_output("elementwise_add") - ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_input("elementwise_add", "Y") ->AsIntermediate(); auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) ->assert_is_op("elementwise_add"); auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) - ->assert_is_op_input("elementwise_add", "Y") + ->assert_is_op_input("elementwise_add", "X") ->AsInput(); auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) ->assert_is_op_output("elementwise_add") @@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) .LinksTo({elementwise_add_out}); - elementwise_add_op_1->LinksFrom( - {elementwise_add_out, elementwise_add_in_y_1}); + elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1}) + .LinksTo({elementwise_add_out_1}); act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); return act_out; } @@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { return elementwise_add_out; } +PDNode *patterns::ConvAffineChannel::operator()( + paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + + auto *affine_channel_op = + pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as AffineChannel input + conv_out_var->assert_is_op_input("affine_channel", "X"); + } + + // AC Scale + auto *ac_scale_var = pattern->NewNode(ac_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Scale"); + // AC Bias + auto *ac_bias_var = pattern->NewNode(ac_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Bias"); + + // AC output + auto *ac_out_var = pattern->NewNode(ac_out_repr()) + ->AsOutput() + ->assert_is_op_output("affine_channel"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } else { + affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } + return ac_out_var; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index eaedd9d08e0fab820481d6eaacb6e7bfc1ab6d1d..61a53003449710da2a52c90197c9f2f3ac56c7bb 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_out); }; +// Conv with affine_channel +// op: conv + (elementwise_add +) affine_channel +// named nodes: +// conv_weight, conv_out, conv, +// ac_x, ac_scale, ac_bias +// affine_channel, ac_out +struct ConvAffineChannel : public PatternBase { + ConvAffineChannel(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_affine_channel") {} + + PDNode* operator()(PDNode* conv_input, bool with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(affine_channel); + PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + + // AC(Affine_Channel) inputs + PATTERN_DECL_NODE(ac_scale); + PATTERN_DECL_NODE(ac_bias); + // AC outputs + PATTERN_DECL_NODE(ac_out); // Out +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..92e897ca9ce02ed67f026fd08062842e3bafa098 --- /dev/null +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc @@ -0,0 +1,358 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +const char kSumGradOpName[] = "sum"; +// TODO(minqiyang): only support sgd at current time, please add +// other optimizers later. +const char kOptimizerType[] = "sgd"; + +std::unique_ptr LockFreeOptimizePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + + // We could collect all weights' name from SGD, where + // W1 <- SGD(W0, Grad0) + std::unordered_set weight_var_set; + for (auto* node : graph->Nodes()) { + if (IsOpNamed(node, kOptimizerType)) { + auto& param_out_vars = node->Op()->Output("ParamOut"); + PADDLE_ENFORCE(param_out_vars.size() == 1u); + weight_var_set.insert(param_out_vars[0]); + } + } + + // find all grad's merge op via weight name, where + // Grad0 <- SUM(Grad1, Grad2, Grad3 ...) + std::unordered_set grad_sum_op_set; + for (ir::Node* node : graph->Nodes()) { + if (IsOpNamed(node, kSumGradOpName)) { + for (ir::Node* output : node->outputs) { + // strip the last grad suffix @GRAD + std::string var_name = output->Name(); + const std::string suffix(kGradVarSuffix); + if (var_name != suffix && var_name.size() > suffix.size() && + var_name.substr(var_name.size() - suffix.size()) == suffix) { + // if so then strip them off + var_name = var_name.substr(0, var_name.size() - suffix.size()); + if (weight_var_set.find(var_name) != weight_var_set.end()) { + grad_sum_op_set.insert(node); + break; + } + } + } + } + } + + // get the forward op and backward op pairs, where + // out <- forward(X, W) + // Grad1 <- backward(out, X') + // Grad0 <- SUM(Grad1, Grad2, Grad3 ...) + // W0 <- SGD(W1, Grad0) + for (ir::Node* node : grad_sum_op_set) { + for (ir::Node* merged_grad_var : node->outputs) { + // find the optimizers connected with sum op + if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) && + merged_grad_var->outputs.size() == 1u) { + ir::Node* opt_node = merged_grad_var->outputs[0]; + VLOG(3) << "Found opt node " << opt_node->Name(); + + // find the backward op connected with sum op + for (ir::Node* unmerged_grad_var : node->inputs) { + if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) && + unmerged_grad_var->inputs.size() == 1u) { + ir::Node* backward_op = unmerged_grad_var->inputs[0]; + + VLOG(3) << "Found backward_op " << backward_op->Name(); + + // find the forward op related to the backward op + ir::Node* forward_op = + FindForwardOpViaBackwardOp(graph.get(), backward_op); + + VLOG(3) << "Found forward_op " << forward_op->Name(); + + PADDLE_ENFORCE(forward_op); + + Node* new_optimizer_node = CreateNewSGDNode( + graph.get(), forward_op, backward_op, node, opt_node); + + PADDLE_ENFORCE(new_optimizer_node); + } + } + } + } + } + + // Remove the sum_op and its' outputs and connected Optimizers + for (Node* sum_op : grad_sum_op_set) { + for (Node* sum_op_output : sum_op->outputs) { + for (Node* optimize_op : sum_op_output->outputs) { + if (optimize_op->NodeType() == Node::Type::kOperation && + optimize_op->Name() == kOptimizerType) { + VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_" + << optimize_op->id(); + graph->RemoveNode(optimize_op); + } + } + VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_" + << sum_op_output->id(); + graph->RemoveNode(sum_op_output); + } + VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id(); + graph->RemoveNode(sum_op); + } + + for (auto* node : graph->Nodes()) { + for (Node* output_node : node->outputs) { + if (output_node->Name() == "sgd") { + VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id() + << " --> " << output_node->Name() << "_" << output_node->id(); + for (Node* input_node : node->inputs) { + VLOG(3) << "SGD Input link: " << input_node->Name() << "_" + << input_node->id() << " --> " << node->Name() << "_" + << node->id(); + } + } + } + } + + return graph; +} + +ir::Node* LockFreeOptimizePass::CreateNewSGDNode( + ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node, + ir::Node* grad_sum_node, ir::Node* optimize_node) const { + PADDLE_ENFORCE(graph); + PADDLE_ENFORCE(forward_node); + PADDLE_ENFORCE(backward_node); + PADDLE_ENFORCE(grad_sum_node); + PADDLE_ENFORCE(optimize_node); + + // find the grad var node between the grad sum node and backward_node + std::vector grad_vars = + FindConnectedNode(backward_node, grad_sum_node); + ir::Node* grad_node = nullptr; + for (ir::Node* node : grad_vars) { + if (!ir::IsControlDepVar(*node)) { + grad_node = node; + } + } + PADDLE_ENFORCE(grad_node); + + // create a new SGD node + OpDesc* old_desc = optimize_node->Op(); + // keep with the same block between new optimizer and the old one + OpDesc new_desc(*old_desc, old_desc->Block()); + new_desc.SetInput("Param", old_desc->Input("Param")); + new_desc.SetInput("LearningRate", old_desc->Input("LearningRate")); + new_desc.SetInput("Grad", std::vector({grad_node->Name()})); + new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut")); + + std::vector op_role_vars = boost::get>( + new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName())); + // replace the second op role var, because the grad name was + // changed in new optimizer + op_role_vars.pop_back(); + op_role_vars.push_back(grad_node->Name()); + new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), + op_role_vars); + new_desc.SetType(kOptimizerType); + + // set backward op's op role var, this will be used to + // set device_id in multi_device_pass + backward_node->Op()->SetAttr( + framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars); + // backward_node->Op()->SetAttr( + // framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {}); + + // keep with the same output nodes between new optimizer and the + // old one + Node* sgd_node = graph->CreateOpNode(&new_desc); + + // change all outputs of the optimize_node to the new one + ReplaceAllDownstreamNode(optimize_node, sgd_node); + + // find connected node between forward node and optimize node + // and replace the optimize node to new sgd node + std::vector forward_opt_connected_nodes = + FindConnectedNode(forward_node, optimize_node); + for (ir::Node* node : forward_opt_connected_nodes) { + ReplaceUpstreamNode(node, optimize_node, sgd_node); + } + + // find connected node between backward node and optimize node + // and replace the optimize node to new sgd node + std::vector backward_opt_connected_nodes = + FindConnectedNode(backward_node, optimize_node); + for (ir::Node* node : backward_opt_connected_nodes) { + ReplaceUpstreamNode(node, optimize_node, sgd_node); + } + + // SGD must have only one param and LR in + PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u); + PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u); + + // LR and weight nodes should be copied + for (Node* upstream_node : optimize_node->inputs) { + if (upstream_node->Name() == old_desc->Input("LearningRate")[0] || + upstream_node->Name() == old_desc->Input("Param")[0]) { + ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node); + } + } + + VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id(); + + return sgd_node; +} + +std::vector LockFreeOptimizePass::FindConnectedNode( + ir::Node* upstream_node, ir::Node* downstream_node) const { + std::vector result; + for (ir::Node* out_node : upstream_node->outputs) { + for (ir::Node* in_node : downstream_node->inputs) { + if (in_node == out_node) { + result.push_back(in_node); + } + } + } + + return result; +} + +void LockFreeOptimizePass::ReplaceUpstreamNode( + ir::Node* upstream_node, ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const { + PADDLE_ENFORCE(upstream_node); + PADDLE_ENFORCE(old_optimizer_node); + PADDLE_ENFORCE(new_optimizer_node); + + // Remove the old_optimizer_node from upstream_node's outputs vector + auto& output_node_vec = upstream_node->outputs; + for (auto output_node_iter = output_node_vec.begin(); + output_node_iter != output_node_vec.end();) { + if (*output_node_iter == old_optimizer_node) { + output_node_vec.erase(output_node_iter); + break; + } else { + ++output_node_iter; + } + } + + // Add the new_optimizer_node to upstream_node's outputs vector + output_node_vec.emplace_back(new_optimizer_node); + new_optimizer_node->inputs.emplace_back(upstream_node); +} + +void LockFreeOptimizePass::ReplaceAllDownstreamNode( + ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const { + PADDLE_ENFORCE(old_optimizer_node); + PADDLE_ENFORCE(new_optimizer_node); + + for (ir::Node* downstream_node : old_optimizer_node->outputs) { + // Remove the old_optimizer_node from downstream_node's inputs vector + auto& input_node_vec = downstream_node->inputs; + for (auto input_node_iter = input_node_vec.begin(); + input_node_iter != input_node_vec.end();) { + if (*input_node_iter == old_optimizer_node) { + input_node_vec.erase(input_node_iter); + break; + } else { + ++input_node_iter; + } + } + + // Add the new_optimizer_node to downstream_node's inputs vector + input_node_vec.emplace_back(new_optimizer_node); + new_optimizer_node->outputs.emplace_back(downstream_node); + } +} + +ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp( + ir::Graph* graph, ir::Node* backward_node) const { + PADDLE_ENFORCE(graph); + PADDLE_ENFORCE(backward_node); + + // strip the suffix _grad of backward_node's name + std::string forward_op_name = backward_node->Name(); + const std::string suffix("_grad"); + if (forward_op_name != suffix && forward_op_name.size() > suffix.size() && + forward_op_name.substr(forward_op_name.size() - suffix.size()) == + suffix) { + // if so then strip them off + forward_op_name = + forward_op_name.substr(0, forward_op_name.size() - suffix.size()); + } else { + LOG(WARNING) << "Illegal backward node's name " << backward_node->Name() + << " id " << backward_node->id(); + + return nullptr; + } + + for (ir::Node* node : graph->Nodes()) { + if (node->Name() == forward_op_name) { + if (node->outputs.size() == 0u) { + // if forward_node has no output, then it has NO grad op + continue; + } + + // check whether all inputs of the backward_op that ends_with @GRAD + // comes from the output of forward_op is the input of the backward_op + bool is_related_forward_node = true; + for (ir::Node* backward_input : backward_node->inputs) { + if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) { + bool meets_correct_output = false; + for (ir::Node* forward_output : node->outputs) { + if (forward_output->Name() + kGradVarSuffix == + backward_input->Name()) { + meets_correct_output = true; + break; + } + } + + if (!meets_correct_output) { + is_related_forward_node = false; + break; + } + } + } + + if (is_related_forward_node) { + return node; + } + } + } + + return nullptr; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(lock_free_optimize_pass, + paddle::framework::ir::LockFreeOptimizePass); diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..7310f596f8a3170e84840be4bab8390b780b6577 --- /dev/null +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -0,0 +1,130 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_ +#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_ + +#include +#include + +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +/* +* Remove the sum op of all gradients of the backward op. +* And remove the dependecies of the optimizer related to the +* same backward op. +* +* Before this pass: +* +* forward_op1 forward_op2 +* | | +* grad_op1 grad_op2 +* \ / +* \ / +* sum_op +* | +* sgd_op +* +* After this pass: +* forward_op1 forward_op2 +* | | +* grad_op1 grad_op2 +* | | +* sgd_op1 sgd_op2 +* +* sgd_op1 and sgd_op2 will update the same weight which holds the same +* memory, so we could benefits from the acceleration +*/ +class LockFreeOptimizePass : public Pass { + public: + virtual ~LockFreeOptimizePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + private: + // Create a new sgd node via current optimizer node + ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node, + ir::Node* backward_node, ir::Node* grad_sum_node, + ir::Node* optimize_node) const; + + // Replace the input weight's optimizers + void ReplaceUpstreamNode(ir::Node* upstream_node, + ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const; + + // Replace the output weight's optimizers + void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const; + + // Find all weight variables in graph + bool FindAllWeightVars(ir::Graph* graph) const; + + // Find the forward_op node via the backward_op node + ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph, + ir::Node* backward_node) const; + + std::vector FindConnectedNode(ir::Node* upstream_node, + ir::Node* downstream_node) const; + + inline bool IsOpNamed(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kOperation && node->Name() == name; + } + + inline bool IsVarNamed(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && node->Name() == name; + } + + inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && + boost::algorithm::ends_with(node->Name(), name); + } + + inline bool IsVarNameContains(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && + node->Name().find(name) != std::string::npos; + } + + inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const { + PADDLE_ENFORCE(ctrl_dep_node); + PADDLE_ENFORCE(node); + + return IsControlDepVar(*ctrl_dep_node) && + ctrl_dep_node->inputs.size() >= 1u && + ctrl_dep_node->inputs[0] == node; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +#endif // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_ diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index bd5b76426eb55cebdabfccd700439a4c418a10f0..9e77f98e9efb2c770cbce3b988914ea473a96de1 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -75,6 +75,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( std::vector optimize_ops; std::vector lr_ops; // ops other than forward/backward/optimize std::unordered_set grad_names; + std::unordered_map gradname2paramname; std::vector nodes = TopologySortOperations(*graph); auto origin_nodes = graph->ReleaseNodes(); @@ -99,6 +100,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( auto op_role_vars = boost::get>(op_role_var); for (size_t i = 0; i < op_role_vars.size(); i += 2) { grad_names.insert(op_role_vars[i + 1]); + gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i]; } } else if (op_role & static_cast(framework::OpRole::kLRSched)) { lr_ops.push_back(node); @@ -109,7 +111,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( // 2. copy forward backward ir::Node* prev_repeat_last_op_node = nullptr; - // record origin_grad -> repeated grad list map. + // record origin_grad -> repeated_grad_list map. std::map> grad_repeated_map; std::map> created; std::unordered_set bn_vars_need_rename; @@ -124,10 +126,16 @@ std::unique_ptr BatchMergePass::ApplyImpl( if (grad_names.find(outname) != grad_names.end()) { std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); repeated_op.RenameOutput(outname, new_gname); + // remove op_role_var for backward ops that outputs grad for a + // parameter. + repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + std::vector()); } } // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do - // not need this update + // not need this update, because only moving mean and variance should be + // differ, trainable parameter scale and bias is the same as other + // parameters. if (node->Name() == "batch_norm") { // NOTE: assume bn op created by layers use save var as output mean and // variance @@ -224,16 +232,25 @@ std::unique_ptr BatchMergePass::ApplyImpl( var->inputs.push_back(repeated_node); } } - } + } // end copy forward backward - // 5. create GRAD merge op node + // 5. create GRAD merge op node: sum(repeat.0...repeat.n) -> + // scale(1/num_repeats) for (auto kv : grad_repeated_map) { OpDesc sum_op; sum_op.SetType("sum"); std::vector repeated_grad_names; + std::vector param_grad_op_role_var; for (auto r : kv.second) { repeated_grad_names.push_back(r->Var()->Name()); } + // NOTE: use op_role_var to control allreduce op appending in + // multi_devices_graph_pass, we want to append op_role_var + // only once for the merged gradient, so break after first call. + param_grad_op_role_var.push_back( + gradname2paramname.at(kv.first->Var()->Name())); // param + param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad + sum_op.SetInput("X", repeated_grad_names); sum_op.SetOutput("Out", {kv.first->Var()->Name()}); sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), @@ -256,6 +273,10 @@ std::unique_ptr BatchMergePass::ApplyImpl( scale_op.SetAttr("scale", static_cast(1.0f / num_repeats)); scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kBackward)); + + scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + param_grad_op_role_var); + auto scale_op_node = result.CreateOpNode(&scale_op); scale_op_node->inputs.push_back(sum_out_var_node); sum_out_var_node->outputs.push_back(scale_op_node); diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f9663bbb734d5a6eb78135aecbc3a3b..c3a044d22cf04dceecc164fae934ee15c4563af1 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f1642bc0d2b10f97295e80ee201db8f83bfd06ef..86e6b1f7d92bc7bc97180e05f6a7c14ab375f92f 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifndef PADDLE_ON_INFERENCE - LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " - "cmake flag ON_INFER is not set."; - LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " - "variables will be reused to save the allocation " - "overhead."; - LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " - "setting the cmake flag ON_INFER=ON if you are " - "running Paddle Inference"; + LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the " + "cmake flag ON_INFER is not set."; + LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and " + "variables will be reused to save the allocation " + "overhead."; + LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by " + "setting the cmake flag ON_INFER=ON if you are " + "running Paddle Inference"; #endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 42190b52289bfc6fc510f13cb5190a0d3e03b836..b083493ba4f4d2ea35e805333e028ed7840f9c8d 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -32,8 +32,11 @@ std::map>>)>> NgraphBridge::NG_NODE_MAP = { {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, + {"mean", paddle::operators::ngraphs::BuildMeanNode}, + {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, {"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, + {"scale", paddle::operators::ngraphs::BuildScaleNode}, {"relu", paddle::operators::ngraphs::BuildUnaryNode}, {"tanh", paddle::operators::ngraphs::BuildUnaryNode}, {"top_k", paddle::operators::ngraphs::BuildTopKNode}}; diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 23f681ce886fd0d8c113ffe4e80e25e6a803e31b..7e174c7def1ffa4089a94d9cc504b18843557c53 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() { BuildNgNodes(); ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; - ngraph::op::ParameterVector func_inputs; + ngraph::ParameterVector func_inputs; for (auto& vo : var_out_) { func_outputs.push_back(var_node_map_->at(vo)); @@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { } } - backend_->call(ngraph_function_, t_out, t_in); + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); } // NgraphEngine::RunImpl } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2311614c335a56501ac777d787f6653659294765..ca31303f77c4a30eb64c43404e214779ea78aeaf 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); - AddAttr>(OpCreationCallstackAttrName(), - "Callstack for Op Creatation.") - .SetDefault({}); - Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 0a0f8f4655bc34cdb25205ff6eaec9f96c801ebd..4c59c73d8779eceb267ad532aabccabbd54b0df2 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } - static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c524b4725dfebd6ef07594b0b45c65463..2c1648c81fc999c6306d5b08bc243f3ad21fec04 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. */ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ac2828136b45159b84b2e00e5d2310e7946976f4..f10da22aec1fc03b769e61ac1be750e7df69127d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,15 +16,9 @@ limitations under the License. */ #include #include -#include -#include -#include -#include "gflags/gflags.h" -#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -162,67 +156,31 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - try { - if (VLOG_IS_ON(4)) { - VLOG(4) << place << " " << DebugStringEx(&scope); - } - if (platform::is_gpu_place(place)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } - - // The profile has a process-wide mutex, results in serious performance - // issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. - if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - } else { - RunImpl(scope, place); - } - - if (VLOG_IS_ON(3)) { - VLOG(3) << place << " " << DebugStringEx(&scope); - } - } catch (platform::EnforceNotMet exception) { - if (Attrs().count("sub_block") != 0) { - throw exception; - } - - auto& callstack = Attr>( - OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + } - if (callstack.empty()) { - throw exception; - } - std::ostringstream sout; - sout << "Invoke operator " << Type() << " error.\n"; - sout << "Python Callstacks: \n"; - for (auto& line : callstack) { - sout << line; - } - sout << "C++ Callstacks: \n"; - sout << exception.err_str_; - exception.err_str_ = sout.str(); - throw exception; - } catch (...) { - std::rethrow_exception(std::current_exception()); + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); } + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { - if (inputs_.find(name) != inputs_.end()) { - return true; - } else { - return false; - } + return inputs_.find(name) != inputs_.end(); } std::string OperatorBase::Input(const std::string& name) const { @@ -421,7 +379,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var.Type().name()); + ToTypeName(var.Type())); } } @@ -432,7 +390,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -526,7 +484,7 @@ const std::vector ExecutionContext::MultiInput( PADDLE_ENFORCE( var->IsType(), "should be LoDTensor, but the received type is %s", - var->Type().name()); + ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -545,7 +503,7 @@ const std::vector ExecutionContext::LegacyMultiInput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -574,7 +532,7 @@ std::vector ExecutionContext::MultiOutput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return var->GetMutable(); }); return res; @@ -816,7 +774,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_THROW( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " "type_id is %s.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -839,7 +797,7 @@ class RuntimeInferShapeContext : public InferShapeContext { var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -1082,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - auto& scope = ctx.scope(); int data_type = -1; - std::string last_input_name; for (auto& input : this->inputs_) { - for (auto& ipt_name : input.second) { - auto* var = scope.FindVar(ipt_name); + const std::vector vars = ctx.MultiInputVar(input.first); + for (size_t i = 0; i < vars.size(); ++i) { + const Variable* var = vars[i]; if (var != nullptr) { const Tensor* t = nullptr; if (var->IsType()) { @@ -1098,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized", - ipt_name); + PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", + input.first, i); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", - Type(), last_input_name, data_type, ipt_name, tmp); + "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", + Type(), data_type, tmp); data_type = tmp; - last_input_name = ipt_name; } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1fe2daacf1369902cde732422b4e65c3d156250f..4d29564aeed74558b7f0ec580568f70dad0b40cc 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; /// e.g. Variable "x@GRAD" is the gradient of varibale "x". constexpr char kGradVarSuffix[] = "@GRAD"; +constexpr size_t kGradVarSuffixSize = 5U; + /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,20 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; +} + +inline std::string GradOriginalVarName(const std::string& grad_var_name) { + std::size_t pos = grad_var_name.rfind(kGradVarSuffix); + if (pos == std::string::npos) { + return grad_var_name; + } else { + return grad_var_name.substr(0, pos); + } } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -75,6 +90,10 @@ class RuntimeContext { RuntimeContext(const VariableNameMap& innames, const VariableNameMap& outnames, const Scope& scope); + RuntimeContext(const VariableValueMap& invars, + const VariableValueMap& outvars) + : inputs(invars), outputs(outvars) {} + VariableValueMap inputs; VariableValueMap outputs; }; @@ -110,8 +129,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } @@ -358,6 +377,30 @@ class ExecutionContext { return op_.Outputs(name); } + template + Tensor AllocateTmpTensor(const framework::DDim& dim, + const DevContext& dev_ctx) const { + auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance() + .Get(dev_ctx) + .Allocate(product(dim) * sizeof(T)); + auto& deleter = tmp_allocation_ptr.get_deleter(); + auto* allocation_ptr = tmp_allocation_ptr.release(); + auto shared_allocation = std::shared_ptr( + allocation_ptr, deleter); + + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; + } + private: const OperatorBase& op_; const Scope& scope_; @@ -441,8 +484,9 @@ class OperatorWithKernel : public OperatorBase { void RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const override; - protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + + protected: virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const; diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index ab14732e4d6eab9dd15364da02b436c10ed68a19..fe4804ac253925c112cf7b508efc42c45868a2fa 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -288,3 +288,30 @@ TEST(OpKernel, multi_inputs) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_place); } + +TEST(VarNameTest, all) { + std::string var_name("X"); + std::string grad_var_name = paddle::framework::GradVarName(var_name); + ASSERT_EQ(grad_var_name, "X@GRAD"); + std::string original_var_name = + paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, "X"); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, "X"); + + std::string var_name_2("XYZ"); + grad_var_name = paddle::framework::GradVarName(var_name_2); + ASSERT_EQ(grad_var_name, "XYZ@GRAD"); + original_var_name = paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, "XYZ"); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, "XYZ"); + + std::string var_name_3(""); + grad_var_name = paddle::framework::GradVarName(var_name_3); + ASSERT_EQ(grad_var_name, "@GRAD"); + original_var_name = paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, ""); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, ""); +} diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a921f469f5e0276884fe194c99b15100a11113dc..450fe1508f2a505a233b3d300cb7c500894231e7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,12 +21,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#include "paddle/fluid/platform/nccl_helper.h" -#endif - #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" @@ -38,6 +35,8 @@ limitations under the License. */ DEFINE_string(pe_profile_fname, "", "Profiler filename for PE, which generated by gperftools." "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); +DEFINE_bool(enable_parallel_graph, false, + "Force disable parallel graph execution mode if set false."); namespace paddle { namespace framework { @@ -106,6 +105,7 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + size_t nranks_; // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and // then keeps unchanged @@ -201,6 +201,7 @@ ParallelExecutor::ParallelExecutor( member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = num_trainers * places.size(); if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, @@ -224,62 +225,98 @@ ParallelExecutor::ParallelExecutor( } } + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + build_strategy.enable_parallel_graph_ = + EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); + + VLOG(1) << "Enable ParallelGraph Execution: " + << build_strategy.enable_parallel_graph_; + if (member_->use_cuda_) { // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; + // gen_nccl_id operator can broadcast the ncclUniqueId for nccl2 collective + // distributed training + auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); if (nccl_id_var != nullptr) { nccl_id = nccl_id_var->GetMutable(); } + if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) { + if (nccl_id == nullptr) { + local_nccl_id_.reset(new ncclUniqueId()); + platform::dynload::ncclGetUniqueId(local_nccl_id_.get()); + nccl_id = local_nccl_id_.get(); + } + } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->places_, nccl_id, num_trainers, trainer_id)); #else PADDLE_THROW("Not compiled with CUDA"); #endif } - if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (build_strategy.enable_parallel_graph_) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } + } else { + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } +#else std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->use_cuda_, member_->nccl_ctxs_.get()); -#else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - member_->local_scopes_, member_->use_cuda_); + member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - graph = member_->PrepareGCAndRefCnts(std::move(graph), - static_cast(max_memory_size)); + for (size_t i = 0; i < graphs.size(); ++i) { + graphs[i] = member_->PrepareGCAndRefCnts( + std::move(graphs[i]), static_cast(max_memory_size)); + } } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); + for (auto &graph : graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } } } + // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); + size_t graph_num = ir::GraphNum(*graphs[0]); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graph) + << ir::GraphNum(*graphs[0]) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -287,14 +324,20 @@ ParallelExecutor::ParallelExecutor( } } - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + if (build_strategy.enable_parallel_graph_) { + member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + std::move(graphs))); } else { - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -320,6 +363,7 @@ void ParallelExecutor::BCastParamsToDevices( if (paddle::platform::is_gpu_place(main_tensor.place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; + buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -353,9 +397,7 @@ void ParallelExecutor::BCastParamsToDevices( #endif } else { platform::CPUPlace cpu; - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (i == 0) continue; - + for (size_t i = 1; i < member_->places_.size(); ++i) { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); @@ -424,6 +466,36 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } } +bool ParallelExecutor::EnableParallelGraphExecution( + const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const { + if (!FLAGS_enable_parallel_graph) return false; + + bool enable_parallel_graph = true; + // TODO(Yancey1989): support sparse update in ParallelGraph mode. + for (auto &var_desc : main_program.Block(0).AllVars()) { + if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) { + enable_parallel_graph = false; + } + } + + // TODO(Yancey1989): support pserver mode + for (auto &op_desc : main_program.Block(0).AllOps()) { + if (op_desc->Type() == "send" || op_desc->Type() == "recv") { + enable_parallel_graph = false; + break; + } + } + + if (!member_->use_all_reduce_ || !member_->use_cuda_) + enable_parallel_graph = false; + + if (build_strategy.enable_sequential_execution_ || + exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) + enable_parallel_graph = false; + return enable_parallel_graph; +} + ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5f6c2159aa2d90378ac298a8e56b51a188225d45..49d3f0d3f6f2a8965d39b656071d86bde42bfd93 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -28,6 +28,10 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + namespace paddle { namespace framework { @@ -68,8 +72,14 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; + bool EnableParallelGraphExecution(const ProgramDesc &main_program, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const; ParallelExecutorPrivate *member_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + std::unique_ptr local_nccl_id_; +#endif }; } // namespace framework diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dbf00f3a79f7d1dcf97b346fccfdb68f119d4aa3..f8aa87519a2fc1a14765887e95c96883d7b4589f 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -16,7 +16,9 @@ limitations under the License. */ #if !defined(_WIN32) #include -#endif // !_WIN32 +#else +#include // NOLINT +#endif // !_WIN32 #include "paddle/fluid/platform/enforce.h" @@ -29,17 +31,17 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } - void RDLock() { + inline void RDLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); } - void WRLock() { + inline void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); } - void UNLock() { + inline void UNLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); } @@ -51,81 +53,46 @@ struct RWLock { // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. struct RWLock { - void RDLock() {} - void WRLock() {} - void UNLock() {} + // FIXME(minqiyang): use mutex here to do fake lock + inline void RDLock() { mutex_.lock(); } + + inline void WRLock() { mutex_.lock(); } + + inline void UNLock() { mutex_.unlock(); } + + private: + std::mutex mutex_; }; #endif -class RWLockGuard { +class AutoWRLock { public: - enum Status { kUnLock, kWRLock, kRDLock }; - - RWLockGuard(RWLock* rw_lock, Status init_status) - : lock_(rw_lock), status_(Status::kUnLock) { - switch (init_status) { - case Status::kRDLock: { - RDLock(); - break; - } - case Status::kWRLock: { - WRLock(); - break; - } - case Status::kUnLock: { - break; - } - } - } + explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - void WRLock() { - switch (status_) { - case Status::kUnLock: { - lock_->WRLock(); - status_ = Status::kWRLock; - break; - } - case Status::kWRLock: { - break; - } - case Status::kRDLock: { - PADDLE_THROW( - "Please unlock read lock first before invoking write lock."); - break; - } - } - } + ~AutoWRLock() { UnLock(); } - void RDLock() { - switch (status_) { - case Status::kUnLock: { - lock_->RDLock(); - status_ = Status::kRDLock; - break; - } - case Status::kRDLock: { - break; - } - case Status::kWRLock: { - PADDLE_THROW( - "Please unlock write lock first before invoking read lock."); - break; - } - } - } + private: + inline void Lock() { lock_->WRLock(); } - void UnLock() { - if (status_ != Status::kUnLock) { - lock_->UNLock(); - status_ = Status::kUnLock; - } - } + inline void UnLock() { lock_->UNLock(); } + + private: + RWLock* lock_; +}; + +class AutoRDLock { + public: + explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + ~AutoRDLock() { UnLock(); } + + private: + inline void Lock() { lock_->RDLock(); } - ~RWLockGuard() { UnLock(); } + inline void UnLock() { lock_->UNLock(); } private: RWLock* lock_; - Status status_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 6fa5e99f9f3a7e871f1a742a30803853988ea6eb..953618560913229cd1e47659ad61e621efc10ed1 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false, // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_LOCK_GUARD +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { @@ -67,19 +73,23 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_LOCK_GUARD - kids_.push_back(new Scope(this)); - return *kids_.back(); + Scope* child = new Scope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; } Variable* Scope::Var(const std::string& name) { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_LOCK_GUARD - auto new_name = string::Sprintf("%p.%d", this, vars_.size()); + SCOPE_VARS_WRITER_LOCK + auto new_name = std::to_string(reinterpret_cast(this)) + "." + + std::to_string(vars_.size()); if (name != nullptr) { *name = new_name; } @@ -87,44 +97,46 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_LOCK_GUARD + SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_KIDS_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_LOCK_GUARD std::vector known_vars; - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_KIDS_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -138,8 +150,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_LOCK_GUARD std::set var_set(var_names.begin(), var_names.end()); + SCOPE_VARS_WRITER_LOCK for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -151,12 +163,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -165,11 +177,9 @@ std::string Scope::Rename(const std::string& origin_name) const { Variable* Scope::VarInternal(const std::string& name) { auto* v = FindVarLocally(name); if (v != nullptr) return v; - v = new Variable(); - vars_[name].reset(v); + vars_.emplace(name, std::unique_ptr(v)); VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index aded1f771cedbf2442ad36d7fab3e6e6caffdc24..f0915d2eee072b0bcd53f37dad5ef9d801c87172 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,12 +14,18 @@ limitations under the License. */ #pragma once +extern "C" { +#include +} + #include -#include // NOLINT +#include #include #include +#include #include +#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -95,7 +101,14 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map> vars_; + struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } + }; + + mutable std::unordered_map, KeyHasher> + vars_; private: // Call Scope::NewScope for a sub-scope. @@ -124,7 +137,8 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable std::mutex mutex_; + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..5cb241a7a341d793d8450f0c9cde3929acef8965 --- /dev/null +++ b/paddle/fluid/framework/scope_pool.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace framework { + +ScopePool &ScopePool::Instance() { // NOLINT + static ScopePool pool; + return pool; +} + +void ScopePool::DeleteScope(Scope *scope) { delete scope; } + +void ScopePool::Insert(std::unique_ptr &&s) { + std::lock_guard guard(mtx_); + scopes_.insert(s.release()); +} + +void ScopePool::Remove(Scope *s) { + size_t has_scope; + { + std::lock_guard guard(mtx_); + has_scope = scopes_.erase(s); + } + PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope"); + DeleteScope(s); +} + +ScopePool::~ScopePool() { Clear(); } + +void ScopePool::Clear() { + std::lock_guard guard(mtx_); + for (auto *s : scopes_) { + DeleteScope(s); + } + scopes_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h b/paddle/fluid/framework/scope_pool.h similarity index 64% rename from paddle/fluid/framework/details/multi_devices_graph_check_pass.h rename to paddle/fluid/framework/scope_pool.h index 1e2b1867c376956d7d2dac465c13e2f3f64ba7eb..a8b468699abe148d44a395cf888158aefab4380b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h +++ b/paddle/fluid/framework/scope_pool.h @@ -14,25 +14,33 @@ #pragma once -#include "paddle/fluid/framework/details/multi_devices_helper.h" - -#include +#include // NOLINT +#include +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace framework { -namespace details { -class SSAGraghBuilderWithChecker : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return graph; - } +class ScopePool { + public: + static ScopePool &Instance(); // NOLINT + + void Insert(std::unique_ptr &&s); + + void Remove(Scope *s); + + void Clear(); + + ~ScopePool(); + + private: + ScopePool() = default; + + static void DeleteScope(Scope *scope); - bool IsValidGraph(const ir::Graph* graph) const; + std::unordered_set scopes_; + std::mutex mtx_; }; -} // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 871c7bd2a77d1cc5057177619b5cd7b2083ff308..1ffd357e62b4bdc72dbec627c463730aa2c8f720 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -151,27 +151,5 @@ void TensorToVector(const Tensor& src, std::vector* dst) { memory::Copy(dst_place, dst_ptr, boost::get(src.place()), src_ptr, size); } - -template -paddle::framework::Tensor GetTensor( - memory::allocation::AllocationPtr temp_allocation_ptr, - const framework::DDim& dim) { - auto& deleter = temp_allocation_ptr.get_deleter(); - auto* allocation_ptr = temp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE( - dynamic_cast(allocation_ptr) != nullptr, - "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), - framework::product(dim) * sizeof(T)); - - paddle::framework::Tensor temp_tensor( - framework::ToDataType(std::type_index(typeid(T)))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index fcec955360f1c681a62929e904d5736854a8ffad..d34f826c1abb99198fd4dbe9537495edff7b63af 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -89,7 +89,6 @@ void ThreadPool::TaskLoop() { task = std::move(tasks_.front()); tasks_.pop(); } - // run the task task(); } diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..731da74eff4d22da6730e589a1af919514f1c4b7 --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops.h @@ -0,0 +1,179 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +namespace detail { + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) { + data[kStart] = val; + UnrollFillConstant::Run(data, val); + } +}; + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) {} +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) { + d2[kStart] = static_cast(d1[kStart]); + UnrollAssign::Run(d1, d2); + } +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {} +}; + +template +struct UnrollVarArgsAssignImpl { + template + HOSTDEVICE inline static void Run(T *d, T val, Args... args) { + static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument"); + d[kStart] = val; + UnrollVarArgsAssignImpl::Run( + d, args...); + } +}; + +template +struct UnrollVarArgsAssignImpl { + HOSTDEVICE inline static void Run(T *d) {} +}; + +template +struct UnrollVarArgsAssign { + template + HOSTDEVICE inline static void Run(T *d, Args... args) { + UnrollVarArgsAssignImpl::Run( + d, args...); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline static bool Run(const T *d1, const T *d2) { + return d1[kStart] == d2[kStart] && + UnrollCompare::Run(d1, d2); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) { + return true; + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] + d2[kStart]; + UnrollAdd::Run(d1, d2, d3); + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] * d2[kStart]; + UnrollMul::Run(d1, d2, d3); + } +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline static T Run(const T *d) { + return d[kStart] * + UnrollProduct::Run(d); + } + + template + HOSTDEVICE inline static T Run(const T *d1, const T *d2) { + return d1[kStart] * d2[kStart] + + UnrollProduct::Run(d1, d2); + } +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline constexpr static T Run(const T *d) { + return 1; + } + + template + HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) { + return 0; + } +}; + +} // namespace detail + +template +using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>; + +template +using UnrollAssign = detail::UnrollAssign<0, N, N == 0>; + +template +using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; + +template +using UnrollCompare = detail::UnrollCompare<0, N, N == 0>; + +template +using UnrollAdd = detail::UnrollAdd<0, N, N == 0>; + +template +using UnrollMul = detail::UnrollMul<0, N, N == 0>; + +template +using UnrollProduct = detail::UnrollProduct<0, N, N == 0>; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/fluid/framework/unroll_array_ops_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..51433c83c801765d8df10590abdd319ba60e4873 --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops_test.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/unroll_array_ops.h" +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +template +bool CheckEquality(const T* p, size_t n, T val) { + return std::all_of(p, p + n, [val](const T& v) { return v == val; }); +} + +template +bool FillConstantTestMain() { + static_assert(D1 >= D2, ""); + std::array arr; + arr.fill(0); + + UnrollFillConstant::Run(arr.data(), 1); + return CheckEquality(arr.data(), D2, 1) && + CheckEquality(arr.data() + D2, arr.size() - D2, 0); +} + +TEST(unroll_ops, fill_constant) { + EXPECT_TRUE((FillConstantTestMain<9, 0>())); + EXPECT_TRUE((FillConstantTestMain<9, 1>())); + EXPECT_TRUE((FillConstantTestMain<9, 4>())); + EXPECT_TRUE((FillConstantTestMain<9, 9>())); +} + +TEST(unroll_ops, assign) { + const int a[] = {1, 2, 3, 4, 5}; + int b[] = {0, 0, 0, 0, 0}; + UnrollAssign<3>::Run(a, b); + EXPECT_EQ(b[0], 1); + EXPECT_EQ(b[1], 2); + EXPECT_EQ(b[2], 3); + EXPECT_EQ(b[3], 0); + EXPECT_EQ(b[4], 0); +} + +TEST(unroll_ops, var_args_assign) { + int a[] = {0, 0, 0}; + UnrollVarArgsAssign::Run(a, 1, 2); + EXPECT_EQ(a[0], 1); + EXPECT_EQ(a[1], 2); + EXPECT_EQ(a[2], 0); +} + +TEST(unroll_ops, compare) { + int a[] = {1, 2, 3}; + int b[] = {1, 2, 4}; + EXPECT_TRUE(UnrollCompare<2>::Run(a, b)); + EXPECT_FALSE(UnrollCompare<3>::Run(a, b)); + + b[0] = -1; + EXPECT_TRUE(UnrollCompare<0>::Run(a, b)); + EXPECT_FALSE(UnrollCompare<1>::Run(a, b)); +} + +TEST(unroll_ops, add) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + int c[] = {0, 0, 0}; + UnrollAdd<2>::Run(a, b, c); + EXPECT_EQ(a[0] + b[0], c[0]); + EXPECT_EQ(a[1] + b[1], c[1]); + EXPECT_EQ(c[2], 0); +} + +TEST(unroll_ops, mul) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + int c[] = {0, 0, 0}; + UnrollMul<2>::Run(a, b, c); + EXPECT_EQ(a[0] * b[0], c[0]); + EXPECT_EQ(a[1] * b[1], c[1]); + EXPECT_EQ(c[2], 0); +} + +TEST(unroll_ops, product) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + + EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]); + + EXPECT_EQ(UnrollProduct<3>::Run(a, b), + a[0] * b[0] + a[1] * b[1] + a[2] * b[2]); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 3b6f1cdb8f24ab20bfc80eeeba88891d7b41d1f9..73be446f71f193bea203c986b482e6b98a9826c5 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -19,52 +19,50 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { template -inline bool IsType(const std::type_index& type_index) { - return type_index == std::type_index(typeid(T)); +inline bool IsType(const std::type_index& type) { + return type == typeid(T); } -inline proto::VarType::Type ToVarType(std::type_index type) { - if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_RANK_TABLE; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (IsType(type)) { - return proto::VarType_Type_SELECTED_ROWS; - } else if (IsType(type)) { - return proto::VarType_Type_READER; - } else { - PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); +inline proto::VarType::Type ToVarType(int type) { + switch (type) { + case proto::VarType::LOD_TENSOR: + case proto::VarType::SELECTED_ROWS: + case proto::VarType::LOD_RANK_TABLE: + case proto::VarType::LOD_TENSOR_ARRAY: + case proto::VarType::READER: + return static_cast(type); + default: + PADDLE_THROW("ToVarType:Unsupported type %d", type); } } template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { - switch (ToVarType(var.Type())) { - case proto::VarType_Type_LOD_TENSOR: + switch (var.Type()) { + case proto::VarType::LOD_TENSOR: visitor(var.Get()); return; - case proto::VarType_Type_LOD_RANK_TABLE: + case proto::VarType::LOD_RANK_TABLE: visitor(var.Get()); return; - case proto::VarType_Type_LOD_TENSOR_ARRAY: + case proto::VarType::LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case proto::VarType_Type_SELECTED_ROWS: + case proto::VarType::SELECTED_ROWS: visitor(var.Get()); return; - case proto::VarType_Type_READER: + case proto::VarType::READER: visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 7842168f603885ce7dc87d2a01dfa4f544389faa..2a75394fca719196a9d53894b080598e942baa45 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) { op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, + ASSERT_EQ(proto::VarType::LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc new file mode 100644 index 0000000000000000000000000000000000000000..a37b1fbab8cfd0642beaf725c02941002b2176b3 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +// Besides registering variable type id, it is helpful to register a +// var_id -> std::type_index map (for example, get type names according to id) +namespace detail { + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *id_to_type, MapType2 *type_to_id) { + using Type = + typename std::tuple_element::type; + static_assert(!std::is_same::value, "Type cannot be void"); + constexpr int kId = VarTypeTrait::kId; + auto type = std::type_index(typeid(Type)); + PADDLE_ENFORCE(id_to_type->count(kId) == 0, + "Registered duplicate type id %d for type %s", kId, + type.name()); + PADDLE_ENFORCE(type_to_id->count(type) == 0, + "Registered duplicate type_index %s for id %d", type.name(), + kId); + id_to_type->emplace(kId, type); + type_to_id->emplace(type, kId); + VarIdToTypeIndexMapInitializerImpl::Init(id_to_type, + type_to_id); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *, MapType2 *) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map and std::type_index -> var_id map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder); + + public: + static const std::type_index &ToTypeIndex(int var_id) { + auto it = Instance().id_to_type_map_.find(var_id); + PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + static int ToTypeId(const std::type_index &type) { + auto it = Instance().type_to_id_map_.find(type); + PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), + "VarType %s is not registered.", type.name()); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_); + } + + static const VarIdToTypeIndexMapHolder &Instance() { + static const VarIdToTypeIndexMapHolder instance; + return instance; + } + + std::unordered_map id_to_type_map_; + std::unordered_map type_to_id_map_; +}; + +} // namespace detail + +const std::type_index &VarTraitIdToTypeIndex(int var_id) { + return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); +} + +const char *ToTypeName(int var_id) { + return VarTraitIdToTypeIndex(var_id).name(); +} + +int TypeIndexToVarTraitId(const std::type_index &type) { + return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..733542e4972b16a71f9e76c3076b424b7a901066 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.h @@ -0,0 +1,195 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include +#ifndef _WIN32 +#include +#endif +#endif + +// Users should add forward declarations here +namespace paddle { + +namespace platform { +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +class Communicator; +#endif +#endif +} // namespace platform + +namespace framework { +class Tensor; +class LoDTensor; +class SelectedRows; +class LoDRankTable; +class ReaderHolder; +class Scope; +} // namespace framework + +namespace operators { +template +class AlgorithmsCache; + +class CudnnRNNCache; + +namespace reader { +class LoDTensorBlockingQueueHolder; +} // namespace reader +} // namespace operators + +} // namespace paddle + +namespace paddle { +namespace framework { + +const char *ToTypeName(int var_id); +const std::type_index &VarTraitIdToTypeIndex(int var_id); +int TypeIndexToVarTraitId(const std::type_index &type); + +namespace detail { + +template +struct TypePosFinderImpl { + static constexpr int kPos = + std::is_same::value + ? kStart + : TypePosFinderImpl::kPos; +}; + +template +struct TypePosFinderImpl { + static constexpr int kPos = std::is_same::value ? kStart : -1; +}; + +// TypePosFinder helps to find the position in which T is inside Args... +// If T is not inside Args..., kPos would be -1 +template +struct TypePosFinder { + static constexpr int kPos = + TypePosFinderImpl::kPos; +}; + +template +struct VarTypeRegistryImpl { + static constexpr size_t kRegisteredTypeNum = sizeof...(Args); + using ArgTuple = std::tuple; + + // TypePos() returns the position in which T is inside Args... + // If T is not inside Args..., return -1 + template + static constexpr int TypePos() { + return TypePosFinder::kPos; + } + + // IsRegistered() returns whether T is registered inside RegistryImpl + template + static constexpr bool IsRegistered() { + return TypePos() >= 0; + } +}; + +} // namespace detail + +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = static_cast(proto_id); \ + } + +/** + * The following codes are designed to register variable types. + * Only registered types can be stored in Variable. + * This registry mechanism is designed to speed up Variable. + * + * Caution: If you want to add more var types, please consider carefully + * whether you really need to add it. + */ + +// Users should add other variable types below. +// Paddle would generate unique Ids for each registered variable types. +using VarTypeRegistry = detail::VarTypeRegistryImpl< + Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, + LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, + std::map, operators::reader::LoDTensorBlockingQueueHolder, +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 + ncclUniqueId, platform::Communicator, +#endif + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::CudnnRNNCache, +#endif + int, float>; + +template +struct VarTypeTrait { + static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); + using Type = T; + /** + * Unique VarType Id generation. + * + * The auto-generated id should not be the same as any protobuf id defined in + * framework.proto. Therefore, we generate id by adding the type pos and + * maximum protobuf id (i.e., proto::VarType::TUPLE). + * + * However, we may need more protobuf id in the future. + * To avoid changing this auto id generation algorithm frequently, we + * generate id by adding the type pos and twice of maximum protobuf id (i.e., + * proto::VarType::TUPLE). + */ + static constexpr int kId = VarTypeRegistry::TypePos() + + static_cast(proto::VarType::TUPLE) * 2; +}; + +// Users should set some of variable type ids to be what is defined in +// framework.proto below +REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); +REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); +REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); +REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); +REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); +REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); +REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); + +/** End of variable type registration */ + +template +inline constexpr bool IsRegisteredVarType() { + return VarTypeRegistry::IsRegistered(); +} + +#undef REG_PROTO_VAR_TYPE_TRAIT +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a47275e1ca25a4f66e67b4986ec78e49ea952a51 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *var_id_set, SetType2 *type_index_set) { + using Type = + typename std::tuple_element::type; + static_assert(std::is_same::Type, Type>::value, + "Type must be the same"); + constexpr auto kId = VarTypeTrait::kId; + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type); + EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId); + EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)), + actual_type); + EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId); + + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); + TypeIndexChecker::Check(var_id_set, + type_index_set); + } +}; + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *, SetType2 *) {} +}; + +TEST(var_type_traits, check_no_duplicate_registry) { + constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; + std::unordered_set var_id_set; + std::unordered_set type_index_set; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check( + &var_id_set, &type_index_set); +} + +template +bool CheckVarId(int proto_id) { + static_assert(std::is_same::Type, T>::value, + "Type must be the same"); + return VarTypeTrait::kId == proto_id; +} + +TEST(var_type_traits, check_proto_type_id) { + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); + ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); + ASSERT_TRUE(CheckVarId(proto::VarType::READER)); + ASSERT_TRUE(CheckVarId(proto::VarType::INT32)); + ASSERT_TRUE(CheckVarId(proto::VarType::FP32)); + + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR); + ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS); + ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES); + ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE); + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY, + proto::VarType::LOD_TENSOR_ARRAY); + ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST); + ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER); + ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH); + ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST); + ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW); + ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE); + ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32); + ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32); +} + +TEST(var_type_traits, test_registry) { + using Registry = detail::VarTypeRegistryImpl; + ASSERT_TRUE(Registry::TypePos() == 0); + ASSERT_TRUE(Registry::TypePos() == 1); + ASSERT_TRUE(Registry::TypePos() == 2); + ASSERT_TRUE(Registry::TypePos() == 3); + ASSERT_TRUE(Registry::TypePos() == -1); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a584df3ba90cf5c1a62a3879bf98ce5c..b9d07da822cf1eb42859e1d7d84437582fada8ff 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/var_type_traits.h" namespace paddle { namespace framework { @@ -27,10 +27,14 @@ class Variable { public: template const T& Get() const { + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); return *static_cast(holder_->Ptr()); } @@ -39,61 +43,61 @@ class Variable { template T* GetMutable() { if (!holder_) { - holder_.reset(new PlaceholderImpl(new T())); + holder_.reset(new PlaceholderImpl()); } else { - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); } return static_cast(holder_->Ptr()); } template bool IsType() const { - return holder_ != nullptr && - std::type_index(typeid(T)) == std::type_index(holder_->Type()); + return holder_ && holder_->Type() == VarTypeTrait::kId; } void Clear() { holder_.reset(); } - std::type_index Type() const { + int Type() const { PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); return holder_->Type(); } private: struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_info& Type() const = 0; - virtual void* Ptr() const = 0; + virtual ~Placeholder() = default; + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + + void* ptr_; + int type_; }; // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. template struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} - - virtual const std::type_info& Type() const { return type_; } - virtual void* Ptr() const { return static_cast(ptr_.get()); } + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() { this->Init(&obj_, VarTypeTrait::kId); } - std::unique_ptr ptr_; - const std::type_info& type_; + private: + T obj_; }; - std::unique_ptr - holder_; // pointers to a PlaceholderImpl object indeed. - - // name_ is only meaningful with a Scope and accessible by it. - // - // NOTE: Please don't expose name_ by adding methods like - // Variable::Name or Scope::VarName! A variable could have a human - // readable name or an auto-generated scope-unique name. In the - // former case, the caller knows the name and doesn't need to access - // the name; in the latter case, the variable should be identified - // by its address but not the unreadable name. - friend class Scope; - const std::string* name_; + // pointers to a PlaceholderImpl object indeed. + std::unique_ptr holder_; }; } // namespace framework diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3dfe5ecfd563a686bb72b061aff602f73..511c9c52146ece4b90905cc9d49565103589c1ec 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -16,27 +16,28 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -TEST(Variable, GetMutable) { - using paddle::framework::Variable; - - struct Tensor { - int content_; - }; +namespace paddle { +namespace framework { +TEST(Variable, GetMutable) { std::unique_ptr v(new Variable()); - Tensor* t = v->GetMutable(); - t->content_ = 1234; + auto* t = v->GetMutable(); + *t = "1234"; - const Tensor& tt = v->Get(); - EXPECT_EQ(1234, tt.content_); + const auto& tt = v->Get(); + EXPECT_EQ("1234", tt); try { - v->GetMutable(); + v->GetMutable(); } catch (std::exception& e) { return; } EXPECT_TRUE(false); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 342cb68ab2bf8ceb543317ed8d8f2356ef6b2cde..9813149865489f7c10c0f4942172d46e1bccf81f 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -31,8 +32,14 @@ using framework::Variable; void AddTo(Variable* src, Variable* dst) { framework::LoDTensor* dst_tensor = dst->GetMutable(); framework::LoDTensor* src_tensor = src->GetMutable(); - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", - dst_tensor->numel(), src_tensor->numel()); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label + // ugly fix for it + if (src_tensor->numel() == 0) { + return; + } + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), + "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(), + src_tensor->numel()); float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); const float* src_data = src_tensor->data(); for (size_t i = 0; i < src_tensor->numel(); ++i) { @@ -42,12 +49,12 @@ void AddTo(Variable* src, Variable* dst) { class Autograd { public: - explicit Autograd(framework::Scope* scope) : scope_(scope) {} + Autograd() {} void RunBackward(VarBase* var) { - PADDLE_ENFORCE(var->pre_op_->op_desc_); - // TODO(panyx0718): Only create for vars that "require_grad" - (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; + if (var->stop_gradient_) { + return; + } std::deque ready; ready.push_back(var->pre_op_); @@ -57,18 +64,25 @@ class Autograd { while (!ready.empty()) { OpBase* ready_op = ready.front(); ready.pop_front(); - std::vector input_grads = ready_op->ApplyGrad(scope_); - - for (size_t i = 0; i < input_grads.size(); ++i) { - if (!input_grads[i]) continue; - OpBase* pre_op = ready_op->pre_ops_->at(i); - if (!pre_op) continue; - - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); + std::map> input_grads = + ready_op->ApplyGrad(); + + for (auto it : input_grads) { + const std::vector& ingrads = it.second; + for (size_t i = 0; i < ingrads.size(); ++i) { + if (!ingrads[i]) continue; + if (ready_op->input_vars_[it.first][i]->stop_gradient_) { + continue; + } + OpBase* pre_op = ready_op->pre_ops_[it.first][i]; + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } } } } @@ -85,138 +99,88 @@ class Autograd { while (!queue.empty()) { OpBase* candidate = queue.front(); queue.pop_front(); - for (OpBase* pre_op : *(candidate->pre_ops_)) { - if (!pre_op) continue; - if (visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); + for (auto it : candidate->pre_ops_) { + for (OpBase* pre_op : it.second) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; } - ret[pre_op] += 1; } } - return ret; } - - framework::Scope* scope_; }; -framework::Variable* CreateVariable(const std::string& name, - const framework::DDim& dim, float val, - framework::Scope* scope, - bool random_name = true) { - std::string varname = name; - if (random_name) { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); - int id = dist6(rng); - varname = string::Sprintf("%s@%d", varname, id); - } - - VLOG(3) << "creating var " << varname; - framework::Variable* var = scope->Var(varname); - framework::LoDTensor* tensor = var->GetMutable(); - - float* data = tensor->mutable_data(dim, platform::CPUPlace()); - std::fill(data, data + tensor->numel(), val); - return var; -} - framework::LoDTensor& VarBase::Grad() { VLOG(3) << "get var grad " << var_desc_->Name(); return *grads_->GetMutable(); } -void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { - VLOG(3) << "apply var grad " << var_desc_->Name() << " " - << grad->Get().data()[0]; - if (!grads_) { - grads_ = - CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), - var_->Get().dims(), 0.0, scope); +std::map> OpBase::ApplyGrad() { + if (!grad_op_desc_) { + LOG(WARNING) << "op with no grad: " << op_desc_->Type(); + return {}; } - AddTo(grad, grads_); - VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " - << grads_->Get().data()[0]; -} - -std::vector OpBase::ApplyGrad(framework::Scope* scope) { VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { - if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { - // grad op inputs can be forward inputs, so not in grad_to_var. - continue; - } - VLOG(3) << "op grad in var " << grad_invar; - block_->FindRecursiveOrCreateVar(grad_invar); - framework::Variable* var = scope->Var(grad_invar); - const std::string& invar = grad_to_var_->at(grad_invar); - for (VarBase* varbase : *output_vars_) { - // Use the accumulated grads_ by sharing the input with grads_. - if (varbase->var_desc_->Name() == invar) { - var->GetMutable()->ShareDataWith( - varbase->grads_->Get()); - break; - } + std::vector> tmp_vars; + std::map> grad_outputs; + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + + tmp_vars.emplace_back(tmp_var); + outputs.push_back(tmp_var); } } - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - VLOG(3) << "grad outvar " << outvar; - block_->FindRecursiveOrCreateVar(outvar); - framework::Variable* var = scope->Var(outvar); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(outvar); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - } - grad_op_desc_->InferShape(*block_); + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + + // No need to do compile time infer shape here. + // grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc_); - - opbase->Run(*scope, platform::CPUPlace()); - - // `ret` matches exactly with `input_vars_` of forward op. - std::vector ret; - for (size_t i = 0; i < input_vars_->size(); ++i) { - bool found = false; - VarBase* origin_var = (*input_vars_)[i]; - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - Variable* var = scope->FindVar(outvar); - std::string orig_var = grad_to_var_->at(outvar); - if (origin_var->var_desc_->Name() != orig_var) { - continue; - } - VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; - origin_var->ApplyGrad(scope, var); - found = true; - ret.push_back(var); - // TODO(panyx0718): There might be another outvar with the same name. - // In that case, it doesn't matter the first one or the second one is - // used. - break; - } - if (!found) { - ret.push_back(nullptr); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + auto& origin_outputs = it.second; + + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(outputs[i], orig_grad); } } - return ret; + return input_vars_; } -void VarBase::RunBackward(framework::Scope* scope) { - grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), - var_->Get().dims(), 1.0, scope, - false); +void VarBase::RunBackward() { if (!pre_op_) return; - Autograd(scope).RunBackward(this); + + auto grads_t = grads_->GetMutable(); + float* data = grads_t->mutable_data(platform::CPUPlace()); + std::fill(data, data + grads_t->numel(), 1.0); + + PADDLE_ENFORCE( + grads_ == + pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_); + Autograd().RunBackward(this); } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 85a71ca83d21ed2595ddbe684300a46c05fed3af..2abda933cfc983bcef2433c3be4681c51e78ff1c 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -14,17 +14,69 @@ #pragma once +#include #include #include #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace imperative { +class PreparedOp { + public: + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx) + : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {} + + static PreparedOp Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + const platform::Place& place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. + auto& all_op_kernels = op.AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op.Type()); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + op.Type()); + } + + framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = op.GetExpectedKernelType( + framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = framework::LibraryType::kPlain; + expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", op.Type(), + KernelTypeToString(expected_kernel_key)); + } + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); + } + + const framework::OperatorBase& op; + const framework::RuntimeContext& ctx; + framework::OperatorWithKernel::OpKernelFunc func; + platform::DeviceContext* dev_ctx; +}; class OpBase; class VarBase { @@ -33,56 +85,62 @@ class VarBase { : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), - var_(nullptr), - grads_(nullptr) {} + var_(new framework::Variable()), + grads_(new framework::Variable()), + stop_gradient_(false) {} - virtual ~VarBase() {} + explicit VarBase(bool stop_gradient) + : pre_op_(nullptr), + pre_op_out_idx_(-1), + var_desc_(nullptr), + var_(new framework::Variable()), + grads_(new framework::Variable()), + stop_gradient_(stop_gradient) {} - void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + virtual ~VarBase() {} - void RunBackward(framework::Scope* scope); + void RunBackward(); framework::LoDTensor& Grad(); + inline std::string GradName() const { + PADDLE_ENFORCE( + var_desc_, + "Couldn't get gradient variable's name, please call backward() first"); + return string::Sprintf("%s@IGrad", var_desc_->Name()); + } + OpBase* pre_op_; + std::string pre_op_out_name_; int pre_op_out_idx_; framework::VarDesc* var_desc_; framework::Variable* var_; framework::Variable* grads_; + + bool stop_gradient_; }; class OpBase { public: - OpBase() - : input_vars_(new std::vector()), - output_vars_(new std::vector()), - pre_ops_(new std::vector()), - pre_ops_out_idx_(new std::vector()), - op_desc_(nullptr), - grad_op_desc_(nullptr) {} + OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {} virtual ~OpBase() { - delete input_vars_; - delete output_vars_; - - delete pre_ops_; - delete pre_ops_out_idx_; - if (grad_op_desc_) delete grad_op_desc_; - if (grad_to_var_) delete grad_to_var_; } - std::vector ApplyGrad(framework::Scope* scope); + std::map> ApplyGrad(); - std::vector* input_vars_; - std::vector* output_vars_; - std::vector* pre_ops_; - std::vector* pre_ops_out_idx_; framework::OpDesc* op_desc_; - framework::OpDesc* grad_op_desc_; - std::unordered_map* grad_to_var_; + + std::map> input_vars_; + std::map> output_vars_; + std::map> pre_ops_; + std::map> pre_ops_out_idx_; + + std::map> grad_input_vars_; + std::map> grad_output_vars_; framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 97772dc110135d9d2533e1574933d49f7c8cd346..c6eff86fac7be04821e1f8718ad06356fd6adb48 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" @@ -41,22 +40,26 @@ void CreateGradOp(const framework::OpDesc& op_desc, *grad_op_desc = grad_op_descs[0].release(); } +void InitVar(framework::Variable* var, framework::Variable* grad_var) { + auto& var_t = var->Get(); + float* data = + grad_var->GetMutable()->mutable_data( + var_t.dims(), platform::CPUPlace()); + std::fill(data, data + var_t.numel(), 0.0); +} + class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block, - framework::BlockDesc* startup_block) - : root_block_(root_block), startup_block_(startup_block) { - root_scope_ = new framework::Scope(); - scopes_[root_block_] = root_scope_; - scopes_[startup_block_] = root_scope_; - } + explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} - virtual ~Tracer() { delete root_scope_; } + virtual ~Tracer() {} + + void Trace(OpBase* op, + const std::map>& inputs, + const std::map>& outputs, + framework::BlockDesc* block, const bool stop_gradient = false) { + std::map vars; - void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs, - framework::BlockDesc* block) { - framework::Scope* scope = GetScope(block); framework::OpDesc* op_desc = op->op_desc_; VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); @@ -64,77 +67,113 @@ class Tracer { std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); - *op->input_vars_ = inputs; - for (VarBase* input : inputs) { - const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - input->var_ = var; - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + framework::VariableValueMap invars_map; + framework::VariableValueMap outvars_map; + + op->input_vars_ = inputs; + for (auto it : op->input_vars_) { + auto& invars = invars_map[it.first]; + for (VarBase* inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", + op->op_desc_->Type(), inp->var_desc_->Name()); + + invars.push_back(inp->var_); + vars[inp->var_desc_->Name()] = inp; + if (inp->pre_op_) { + op->pre_ops_[it.first].push_back(inp->pre_op_); + op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); } else { - LOG(ERROR) << "tracer doesn't support yet"; + op->pre_ops_[it.first].push_back(nullptr); } + VLOG(3) << "input vname " << inp->var_desc_->Name() << " " + << inp->var_->IsInitialized(); } - if (input->pre_op_) { - op->pre_ops_->push_back(input->pre_op_); - op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); - } else { - op->pre_ops_->push_back(nullptr); - } - VLOG(3) << "input vname " << vname << " " - << var->Get().dims().size(); } - *op->output_vars_ = outputs; - for (size_t i = 0; i < outputs.size(); ++i) { - const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); + op->output_vars_ = outputs; + for (auto it : op->output_vars_) { + auto& outvars = outvars_map[it.first]; + const std::vector& outputs = it.second; + for (size_t i = 0; i < outputs.size(); ++i) { + VarBase* out = outputs[i]; + outvars.push_back(out->var_); + vars[out->var_desc_->Name()] = out; + + framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + out->var_->GetMutable(); } else { LOG(ERROR) << "tracer doesn't support yet"; } + out->stop_gradient_ = stop_gradient; + out->pre_op_ = op; + out->pre_op_out_name_ = it.first; + out->pre_op_out_idx_ = i; + + VLOG(3) << "output vname " << out->var_desc_->Name() << " " + << out->var_->IsInitialized(); } - outputs[i]->var_ = var; - outputs[i]->pre_op_ = op; - outputs[i]->pre_op_out_idx_ = i; } VLOG(3) << "tracer running " << op_desc->Type(); - op_base->Run(*scope, platform::CPUPlace()); - if (block == startup_block_) { - op->grad_op_desc_ = nullptr; - op->grad_to_var_ = nullptr; - } else { + framework::RuntimeContext ctx(invars_map, outvars_map); + + // TODO(panyx0718): Cache p. + framework::OperatorWithKernel* op_kernel = + dynamic_cast(op_base.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + + if (!stop_gradient) { framework::OpDesc* grad_op_desc; auto grad_to_var = new std::unordered_map(); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; - } - op->block_ = block; - } - framework::Scope* GetScope(framework::BlockDesc* block) { - if (scopes_.find(block) != scopes_.end()) { - return scopes_.at(block); + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + grad_in_vars.push_back(var->grads_); + } + } + } + + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end()); + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + grad_out_vars.push_back(var->grads_); + } + } } - framework::BlockDesc* parent_block = block->ParentBlock(); - PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); - framework::Scope* scope = &scopes_[parent_block]->NewScope(); - scopes_[block] = scope; - return scope; + + op->block_ = block; } private: - std::map scopes_; framework::BlockDesc* root_block_; - framework::BlockDesc* startup_block_; - framework::Scope* root_scope_; }; } // namespace imperative diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cb88333d1570322fbac7112755bab5e11c97201a..4c84d02d8679c4d42c0d02ae83e7f869c0f5ce8b 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -69,19 +69,19 @@ void TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); i++) { - LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; - PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], - result[i]); + LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] + << " result: " << result[i]; + EXPECT_NEAR(static_cast(outputs.front().data.data())[i], result[i], + 1e-3); } } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 83d411eecf6d706615243fd78cb7e4330d904fc1..2d8980b1d15d89cdf9c243a57188a0acb354940d 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -123,10 +123,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); - DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, - std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); // The program transformed by IR analysis phase. DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 51bca8039d4531536cd7a3c39ef8a27f1a5412a1..e37fea38bcb2b1f514347ecbfe7072abb6f07455 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument, for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); - // Set some pass attributes. - if (pass_name == "ir_analysis_pass") { - pass->Set("tensorrt_node_teller", - new SubgraphDetector::NodeInsideSubgraphTeller( - argument->tensorrt_node_teller())); - } - if (pass_name == "graph_viz_pass") { std::string dot_file_path = std::to_string(pass_num) + "_ir_" + (pre_pass.empty() ? "origin" : pre_pass) + @@ -70,11 +63,10 @@ void IRPassManager::CreatePasses(Argument *argument, } if (pass_name == "tensorrt_subgraph_pass") { - PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); - pass->SetNotOwned("tensorrt_node_teller", - argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + pass->Set("min_subgraph_size", + new int(argument->tensorrt_min_subgraph_size())); } // graph_ = pass->Apply(std::move(graph_)); diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 822c7799bb3ae6d79da6cf2a7b3c8c9b20353ed7..9ae5b8aa173b85904df360eb196aefe5af08c6aa 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,9 +1,13 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) -cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector) -set(analysis_deps ${analysis_deps} - subgraph_detector tensorrt_subgraph_pass - CACHE INTERNAL "") -set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) -file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") -set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +if (TENSORRT_FOUND) + cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector tensorrt_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 9c42b83e7add348433635b1899087324e4e370d4..bc06e78ae6997b0d4d0456c15d6e4158efdad300 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { namespace inference { @@ -33,10 +36,13 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); - auto teller = - Get("tensorrt_node_teller"); + auto teller = [](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); + }; - SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/); + SubGraphFuser fuser(graph.get(), teller, + Get("min_subgraph_size") /*min subgraph size*/); fuser(); for (auto *node : graph->Nodes()) { @@ -197,10 +203,26 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, std::vector ExtractParameters( const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its presistable property, but sometimes the presistable + // of the feed op output is true, so we have to identify it. + std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + std::vector parameters; for (const auto &node : nodes) { if (!node->IsVar()) continue; - if (node->Var()->Persistable()) { + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { parameters.push_back(node->Name()); } } @@ -213,6 +235,6 @@ std::vector ExtractParameters( REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) - .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") - .RequirePassAttr("workspace_size"); + .RequirePassAttr("workspace_size") + .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index d3ea511d8f4d8cbec1be57633391f00e29a3e6e9..add9b70f2cd960a94232b35edb928ab4115cbff0 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps} ir_graph_build_pass ir_analysis_pass analysis_passes + subgraph_detector CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index c3a2b3ca1d3b09e71921fde0b0bad8d195aaa38f..490189e550760b4de62724e685dd07f6e521445e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -27,9 +27,6 @@ namespace analysis { void IrAnalysisComposePass::RunImpl(Argument *argument) { ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - InitTensorRTAttrs(argument); - } ApplyIrPasses(argument); CollectFusionStatis(argument); } @@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const { return "ir-analysis-compose-pass"; } -void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - LOG(INFO) << "Initing TensorRT pass"; - argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) { - std::unordered_set teller_set( - {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", - "conv2d_transpose", "leaky_relu"}); - if (!node->IsOp()) return false; - - if (teller_set.count(node->Op()->Type())) { - return true; - } else { - return false; - } - }); - } -} - void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { std::vector passes({ "ir_graph_build_pass", "ir_analysis_pass", diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h index 53e2ebb0038a5c105f68a0146b3da90a6ae34af8..16c6b7d84df88d0ebbc06b547c75a45dcb0c2440 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h @@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass { std::string repr() const override; private: - void InitTensorRTAttrs(Argument* argument); - void ApplyIrPasses(Argument* argument); void CollectFusionStatis(Argument* argument); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index dcefdd92f5157dce7426f2f3e4a2bc053ce24775..211c691504de2c0bd8ff50f34b92cbc01397d5c9 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -14,84 +14,101 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle_pass_builder.h" // NOLINT +#include "paddle/fluid/platform/gpu_info.h" namespace paddle { PassStrategy *contrib::AnalysisConfig::pass_builder() const { - PADDLE_ENFORCE( - pass_builder_.get(), - "Should call constructor first, that will init the pass_builder_."); + if (!pass_builder_.get()) { + if (use_gpu_) { + LOG(INFO) << "Create GPU IR passes"; + pass_builder_.reset(new GpuPassStrategy); + } else { + LOG(INFO) << "Create CPU IR passes"; + pass_builder_.reset(new CpuPassStrategy); + } + } else if (pass_builder_->use_gpu() ^ use_gpu()) { + LOG(WARNING) << "The use_gpu flag is not compatible between Config and " + "PassBuilder, the flags are " + << use_gpu() << " " << pass_builder_->use_gpu(); + LOG(WARNING) << "Please make them compatible, still use the existing " + "PassBuilder."; + } + return pass_builder_.get(); } -contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) { - this->use_gpu = use_gpu; - if (use_gpu) { - pass_builder_.reset(new GpuPassStrategy); - } else { - pass_builder_.reset(new CpuPassStrategy); - } +contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { + model_dir_ = model_dir; +} +contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, + const std::string ¶ms_file) { + prog_file_ = prog_file; + params_file_ = params_file; +} +void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, + const std::string ¶ms_file_path) { + prog_file_ = prog_file_path; + params_file_ = params_file_path; +} +void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, + int device_id) { +#ifdef PADDLE_WITH_CUDA + use_gpu_ = true; + memory_pool_init_size_mb_ = memory_pool_init_size_mb; + device_id_ = device_id; +#else + LOG(ERROR) << "Please compile with gpu to EnableGpu"; + use_gpu_ = false; +#endif } +void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; } contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { - // fields from Config - model_dir = other.model_dir; - // fields from NativeConfig - use_gpu = other.use_gpu; - device = other.device; - fraction_of_gpu_memory = other.fraction_of_gpu_memory; - prog_file = other.prog_file; - param_file = other.param_file; - specify_input_name = other.specify_input_name; - cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; - // fields from this. - enable_ir_optim = other.enable_ir_optim; - // For mkldnn - use_mkldnn_ = other.use_mkldnn_; - mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; - - use_feed_fetch_ops = other.use_feed_fetch_ops; - use_tensorrt_ = other.use_tensorrt_; - tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; - tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - model_from_memory_ = other.model_from_memory_; - - if (use_gpu) { +#define CP_MEMBER(member__) member__ = other.member__; + + // Model related. + CP_MEMBER(model_dir_); + CP_MEMBER(prog_file_); + CP_MEMBER(params_file_); + CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and + // params_file_ fields. + // Gpu releated. + CP_MEMBER(use_gpu_); + CP_MEMBER(device_id_); + CP_MEMBER(memory_pool_init_size_mb_); + // TensorRT releated. + CP_MEMBER(use_tensorrt_); + CP_MEMBER(tensorrt_workspace_size_); + CP_MEMBER(tensorrt_max_batchsize_); + CP_MEMBER(tensorrt_min_subgraph_size_); + // MKLDNN releated. + CP_MEMBER(use_mkldnn_); + CP_MEMBER(mkldnn_enabled_op_types_); + + // Ir related. + CP_MEMBER(enable_ir_optim_); + CP_MEMBER(use_feed_fetch_ops_); + CP_MEMBER(ir_debug_); + CP_MEMBER(specify_input_name_); + + CP_MEMBER(cpu_math_library_num_threads_); + + CP_MEMBER(serialized_info_cache_); + + if (use_gpu_) { pass_builder_.reset(new GpuPassStrategy( *static_cast(other.pass_builder()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast(other.pass_builder()))); } -} -contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { - // fields from Config - model_dir = other.model_dir; - // fields from NativeConfig - use_gpu = other.use_gpu; - device = other.device; - fraction_of_gpu_memory = other.fraction_of_gpu_memory; - prog_file = other.prog_file; - param_file = other.param_file; - specify_input_name = other.specify_input_name; - cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; - // fields from this. - enable_ir_optim = other.enable_ir_optim; - // For mkldnn - use_mkldnn_ = other.use_mkldnn_; - mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; - - use_feed_fetch_ops = other.use_feed_fetch_ops; - use_tensorrt_ = other.use_tensorrt_; - tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; - tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - model_from_memory_ = other.model_from_memory_; - - pass_builder_ = std::move(other.pass_builder_); +#undef CP_MEMBER } void contrib::AnalysisConfig::EnableMKLDNN() { @@ -105,20 +122,95 @@ void contrib::AnalysisConfig::EnableMKLDNN() { } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size) { + int max_batch_size, + int min_subgraph_size) { use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; - // Append after the infer_clean pass. - pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); +} + +void contrib::AnalysisConfig::Update() { + auto info = SerializeInfoCache(); + if (info == serialized_info_cache_) return; + + if (use_gpu_) { + pass_builder_.reset(new GpuPassStrategy); + } else { + pass_builder_.reset(new CpuPassStrategy); + } + + if (use_tensorrt_) { + if (!use_gpu_) { + LOG(ERROR) + << "TensorRT engine is not available when EnableGpu() not actived."; + } else { + // Append after the infer_clean pass. + pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + } + } + + if (use_mkldnn_) { + if (!enable_ir_optim_) { + LOG(ERROR) + << "EnableMKLDNN() only works when IR optimization is enabled."; + } +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMKLDNN(); + use_mkldnn_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; + use_mkldnn_ = false; +#endif + } + + if (ir_debug_) { + pass_builder()->TurnOnDebug(); + } +} + +std::string contrib::AnalysisConfig::SerializeInfoCache() { + std::stringstream ss; + ss << use_gpu_; + ss << memory_pool_init_size_mb_; + + ss << use_tensorrt_; + ss << tensorrt_workspace_size_; + ss << tensorrt_max_batchsize_; + + ss << use_mkldnn_; + ss << enable_ir_optim_; + ss << use_feed_fetch_ops_; + ss << ir_debug_; + + return ss.str(); +} + +void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( + int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; +} + +float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { +#ifdef PADDLE_WITH_CUDA + // Get the GPU memory details and calculate the fraction of memory for the + // GPU memory pool. + size_t gpu_used, gpu_available; + platform::GpuMemoryUsage(&gpu_used, &gpu_available); + double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.; + float fraction_of_gpu_memory = + static_cast(memory_pool_init_size_mb()) / total_gpu_memory; + return fraction_of_gpu_memory; +#else + return 0.; +#endif } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, size_t param_buffer_size) { - prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); - param_file = std::string(param_buffer, param_buffer + param_buffer_size); + prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); + params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3937884ce4a5a16a1093ac8977033eaa98b2678e..585634fae9c85f77cc77d774ac166891014a025c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); @@ -59,8 +60,8 @@ bool AnalysisPredictor::Init( if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; - auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll - : platform::ProfilerState::kCPU; + auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } @@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram( // Optimize the program, and load parameters and modify them in the // scope_. // This will change the scope_ address. - if (config_.enable_ir_optim) { + if (config_.ir_optim()) { status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { @@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram( return true; } bool AnalysisPredictor::CreateExecutor() { - if (config_.use_gpu) { + if (config_.use_gpu_) { status_use_gpu_ = true; - place_ = paddle::platform::CUDAPlace(config_.device); + place_ = paddle::platform::CUDAPlace(config_.device_id_); } else { place_ = paddle::platform::CPUPlace(); } @@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() { } bool AnalysisPredictor::PrepareExecutor() { executor_->Prepare(sub_scope_, *inference_program_, 0, - config_.use_feed_fetch_ops); + config_.use_feed_fetch_ops_); PADDLE_ENFORCE_NOT_NULL(sub_scope_); @@ -250,8 +251,13 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, } input.set_lod(lod); int idx = -1; - if (config_.specify_input_name) { - idx = feed_names_[inputs[i].name]; + if (config_.specify_input_name_) { + auto name = inputs[i].name; + if (feed_names_.find(name) == feed_names_.end()) { + LOG(ERROR) << "feed names from program do not have name: [" << name + << "] from specified input"; + } + idx = feed_names_[name]; } else { idx = boost::get(feeds_[i]->GetAttr("col")); } @@ -309,25 +315,26 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::OptimizeInferenceProgram() { status_program_optimized_ = true; - argument_.SetUseGPU(config_.use_gpu); - argument_.SetGPUDeviceId(config_.device); + argument_.SetUseGPU(config_.use_gpu()); + argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program - if (!config_.model_dir.empty()) { - argument_.SetModelDir(config_.model_dir); + if (!config_.model_dir().empty()) { + argument_.SetModelDir(config_.model_dir()); } else { PADDLE_ENFORCE( - !config_.param_file.empty(), + !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file.empty()); - argument_.SetModelProgramPath(config_.prog_file); - argument_.SetModelParamsPath(config_.param_file); + PADDLE_ENFORCE(!config_.prog_file().empty()); + argument_.SetModelProgramPath(config_.prog_file()); + argument_.SetModelParamsPath(config_.params_file()); } - if (config_.use_gpu && config_.use_tensorrt_) { + if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); + argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); } if (config_.use_mkldnn_) { @@ -335,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } auto passes = config_.pass_builder()->AllPasses(); - if (!config_.enable_ir_optim) passes.clear(); + if (!config_.ir_optim()) passes.clear(); argument_.SetIrAnalysisPasses(passes); argument_.SetScopeNotOwned(const_cast(scope_.get())); Analyzer().Run(&argument_); @@ -352,18 +359,26 @@ template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; - if (config.use_gpu) { + if (config.use_gpu()) { // 1. GPU memeroy - PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); + PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", + config.gpu_device_id()); std::vector flags; - if (config.fraction_of_gpu_memory >= 0.0f || - config.fraction_of_gpu_memory <= 0.95f) { + + float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool(); + if (fraction_of_gpu_memory > 0.95f) { + LOG(ERROR) + << "Allocate too much memory for the GPU memory pool, assigned " + << config.memory_pool_init_size_mb() << " MB"; + LOG(ERROR) + << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)"; + } + + if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) { flags.push_back("dummpy"); std::string flag = "--fraction_of_gpu_memory_to_use=" + - std::to_string(config.fraction_of_gpu_memory); + std::to_string(fraction_of_gpu_memory); flags.push_back(flag); VLOG(3) << "set flag: " << flag; framework::InitGflags(flags); @@ -437,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() { bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program std::string filename; - if (!config_.model_dir.empty()) { - filename = config_.model_dir + "/__model__"; - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + if (!config_.model_dir().empty()) { + filename = config_.model_dir() + "/__model__"; + } else if (!config_.prog_file().empty() && !config_.params_file().empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`. - filename = config_.prog_file; + filename = config_.prog_file(); } else { - if (config_.model_dir.empty() && config_.prog_file.empty()) { + if (config_.model_dir().empty() && config_.prog_file().empty()) { LOG(ERROR) << "Either model_dir or (prog_file, param_file) should be set."; return false; } LOG(ERROR) << string::Sprintf( - "not valid model path '%s' or program path '%s'.", config_.model_dir, - config_.param_file); + "not valid model path '%s' or program path '%s'.", config_.model_dir(), + config_.params_file()); return false; } @@ -472,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() { proto.ParseFromString(pb_content); } else { - proto.ParseFromString(config_.prog_file); + proto.ParseFromString(config_.prog_file()); } inference_program_.reset(new framework::ProgramDesc(proto)); return true; @@ -502,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() { new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetPersistable(true); - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { params.push_back(new_var->Name()); } else { // append_op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load"); op->SetOutput("Out", {new_var->Name()}); - op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); + op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()}); op->CheckAttrs(); } } } - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { // sort paramlist to have consistent ordering std::sort(params.begin(), params.end()); // append just the load_combine op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load_combine"); op->SetOutput("Out", params); - op->SetAttr("file_path", {config_.param_file}); + op->SetAttr("file_path", {config_.params_file()}); op->CheckAttrs(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 12ecb7c15e92c3efcdb27a7058e9481a6f476674..a6e126c5d533f4299ccc3deed7d116cabc71f75b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -35,8 +35,11 @@ using framework::proto::ProgramDesc; using framework::NaiveExecutor; using contrib::AnalysisConfig; -/* This predictor is based on the original native predictor with IR and Analysis - * support. It will optimize IR and Parameters in the runtime. +/** \brief This predictor is based on the original native predictor with IR and + * Analysis support. + * + * It will optimize IR and Parameters in the runtime. + * * TODO(Superjomn) Replace the Navive predictor? */ class AnalysisPredictor : public PaddlePredictor { diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a361b34437ade36dfba2c99db800a7d77ada8704..6169e60541e4a14d560e719d56624b3219dbcefd 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -25,9 +25,9 @@ namespace paddle { using contrib::AnalysisConfig; TEST(AnalysisPredictor, analysis_off) { - AnalysisConfig config(false); - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = false; + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(false); auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(true); #ifdef PADDLE_WITH_CUDA - AnalysisConfig config(true); - config.fraction_of_gpu_memory = 0.15; + config.EnableUseGpu(100, 0); #else - AnalysisConfig config; + config.DisableGpu(); #endif - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = true; auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) { } // compare with NativePredictor - auto naive_predictor = CreatePaddlePredictor(config); + auto naive_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); std::vector naive_outputs; ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); ASSERT_EQ(naive_outputs.size(), 1UL); @@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) { TEST(AnalysisPredictor, ZeroCopy) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = false; - + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(false); auto predictor = CreatePaddlePredictor(config); auto w0 = predictor->GetInputTensor("firstw"); @@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) { TEST(AnalysisPredictor, Clone) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = true; - config.enable_ir_optim = true; + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(true); + config.SwitchIrOptim(true); std::vector> predictors; predictors.emplace_back(CreatePaddlePredictor(config)); diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 6a8b81cc57281b12cd3a4c89c863b20a824ce34a..e14d93de2c41f740bc175c8e59412d7b828dd381 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,8 +19,6 @@ limitations under the License. */ #pragma once -#define WITH_ANAKIN - #include #include "framework/core/net/net.h" diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 102147a493ed1454db1a78124200f163f68e555b..85e250aaaf4a18a261a4bfc5271670f93565a336 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -288,7 +288,7 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. GPU memeroy - PADDLE_ENFORCE_GT( + PADDLE_ENFORCE_GE( config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index c1fcd198ccda07bb6cdd9911716be911ffef6e8d..d2133bd467376c723a80a98725ac7c70234c54b0 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 78396397397c3125c3990073d6b2887ebb477ff2..54895679ca37362c7267677af80274b8de95e296 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) { #endif TEST(PassBuilder, Delete) { - contrib::AnalysisConfig config(false); + contrib::AnalysisConfig config; + config.DisableGpu(); config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); const auto& passes = config.pass_builder()->AllPasses(); auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 61ecd7bce683e40bbf89a343bfdbaa2b7051ae73..30215e480f908f353f00cbc9077e6c057222423a 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -36,12 +36,11 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::AnalysisConfig config(true); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; + paddle::contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(FLAGS_modeldir + "/__params__", + FLAGS_modeldir + "/__model__"); config.EnableTensorRtEngine(); - config.fraction_of_gpu_memory = 0.1; // set by yourself predictor = CreatePaddlePredictor(config); VLOG(3) << "begin to process data"; diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index bc8891455dc8e4a30ddfcc5f89792296e59c2548..5320992b7e78f4aa0ea8950af03038c1953dd027 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -40,15 +40,14 @@ using contrib::AnalysisConfig; */ void Main(bool use_gpu) { std::unique_ptr predictor, analysis_predictor; - AnalysisConfig config(use_gpu); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; - if (FLAGS_use_gpu) { - config.fraction_of_gpu_memory = 0.1; // set by yourself + AnalysisConfig config; + if (use_gpu) { + config.EnableUseGpu(100, 0); } + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); - predictor = CreatePaddlePredictor(config); + predictor = CreatePaddlePredictor(config.ToNativeConfig()); analysis_predictor = CreatePaddlePredictor(config); // Just a single batch of data. diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 569a487328e2f1febe2ca5014b232dbd51d28079..03c2aa3fb8094ce2996f513b90589de0ef903ae8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // TODO(Superjomn) should avoid the case when a TensorArray is a // parameter. if (var_name == "feed" || var_name == "fetch") continue; - if (var->Type() == typeid(framework::LoDTensorArray)) { + if (var->IsType()) { VLOG(4) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 6a5ea64de66fcac44117d0d8f7798e8875703ec6..213c6891d0e2320689c8c69266d40611f295edc8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -27,8 +27,11 @@ namespace details { // training phase. struct TensorArrayBatchCleaner { TensorArrayBatchCleaner() { - valid_types_.insert(typeid(framework::Tensor)); - valid_types_.insert(typeid(framework::LoDTensor)); + constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kLoDTensorId = + framework::VarTypeTrait::kId; + valid_types_.insert(kTensorId); + valid_types_.insert(kLoDTensorId); } // Collect the variables that are not Tensor or LoDTensor, and reset them to a // bool(trick), because some of them are containers, and some operators just @@ -46,7 +49,7 @@ struct TensorArrayBatchCleaner { bool no_tensor_flag_{true}; std::vector arrays_; - std::unordered_set valid_types_; + std::unordered_set valid_types_; std::unordered_set no_tensor_vars_; }; diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 9a393a61c4b4586009a022884e88f3f5c6392ed3..7830e859567747e6c05686335919e8346f76320d 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -113,6 +113,16 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data, + const std::vector &lod) { + int size = lod[lod.size() - 1]; + tensor->shape.assign({size, 1}); + tensor->lod.assign({lod}); + TensorAssignData(tensor, data); +} + template static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, const std::vector> &data) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index f05b9832da55f10b34eb2df914e443a478e5a4a4..ae6ac69854d91d44567ccd985791de5fd2b16f26 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -19,6 +19,8 @@ #include #include +/*! \file */ + // Here we include some header files with relative paths, for that in deploy, // the abstract path of this header file will be changed. #include "paddle_api.h" // NOLINT @@ -34,54 +36,219 @@ class AnalysisPredictor; namespace contrib { // NOTE WIP, not stable yet. -struct AnalysisConfig : public NativeConfig { - explicit AnalysisConfig(bool use_gpu = false); +struct AnalysisConfig { + AnalysisConfig() = default; explicit AnalysisConfig(const AnalysisConfig& other); - explicit AnalysisConfig(AnalysisConfig&& other); + explicit AnalysisConfig(const std::string& model_dir); + explicit AnalysisConfig(const std::string& prog_file, + const std::string& params_file); - // Determine whether to perform graph optimization. - bool enable_ir_optim = true; + /** Set model with a directory. + */ + void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } + /** Set model with two specific pathes for program and parameters. + */ + void SetModel(const std::string& prog_file_path, + const std::string& params_file_path); + /** Set program file path. + */ + void SetProgFile(const std::string& x) { prog_file_ = x; } + /** Set parameter composed file path. + */ + void SetParamsFile(const std::string& x) { params_file_ = x; } + /** Get the model directory path. + */ + const std::string& model_dir() const { return model_dir_; } + /** Get the program file path. + */ + const std::string& prog_file() const { return prog_file_; } + /** Get the composed parameters file. + */ + const std::string& params_file() const { return params_file_; } - // Get a pass builder for customize the passes in IR analysis phase. - PassStrategy* pass_builder() const; + // GPU related. + + /** + * \brief Turn on GPU. + * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB. + * @param device_id the GPU card to use (default is 0). + */ + void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + /** Turn off the GPU. + */ + void DisableGpu(); + /** A bool state telling whether the GPU is turned on. + */ + bool use_gpu() const { return use_gpu_; } + /** Get the GPU device id. + */ + int gpu_device_id() const { return device_id_; } + /** Get the initial size in MB of the GPU memory pool. + */ + int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } + /** Get the proportion of the initial memory pool size compared to the device. + */ + float fraction_of_gpu_memory_for_pool() const; + + /** \brief Control whether to perform IR graph optimization. + * + * If turned off, the AnalysisConfig will act just like a NativeConfig. + */ + void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } + /** A boolean state tell whether the ir graph optimization is actived. + */ + bool ir_optim() const { return enable_ir_optim_; } - // NOT stable yet. - bool use_feed_fetch_ops{true}; + /** \brief INTERNAL Determine whether to use the feed and fetch operators. + * Just for internal development, not stable yet. + * When ZeroCopyTensor is used, this should turned off. + */ + void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } + /** A boolean state telling whether to use the feed and fetch operators. + */ + bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } + /** \brief Control whether to specify the inputs' names. + * + * The PaddleTensor type has a `name` member, assign it with the corresponding + * variable name. This is used only when the input PaddleTensors passed to the + * `PaddlePredictor.Run(...)` cannot follow the order in the training phase. + */ + void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } + + /** A boolean state tell whether the input PaddleTensor names specified should + * be used to reorder the inputs in `PaddlePredictor.Run(...)`. + */ + bool specify_input_name() const { return specify_input_name_; } + + /** + * \brief Turn on the TensorRT engine. + * + * The TensorRT engine will accelerate some subgraphes in the original Fluid + * computation graph. In some models such as TensorRT50, GoogleNet and so on, + * it gains significant performance acceleration. + * + * @param workspace_size the memory size(in byte) used for TensorRT workspace. + * @param max_batch_size the maximum batch size of this prediction task, + * better set as small as possible, or performance loss. + * @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a + * subgraph is less than this, it will not transfer to TensorRT engine. + */ void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1); - bool use_tensorrt() const { return use_tensorrt_; } + int max_batch_size = 1, int min_subgraph_size = 3); + /** A boolean state telling whether the TensorRT engine is used. + */ + bool tensorrt_engine_enabled() const { return use_tensorrt_; } + /** Control whther to debug IR graph analysis phase. + */ + void SwitchIrDebug(int x = true) { ir_debug_ = x; } + + /** Turn on MKLDNN. + */ void EnableMKLDNN(); - bool use_mkldnn() const { return use_mkldnn_; } + /** A boolean state telling whether to use the MKLDNN. + */ + bool mkldnn_enabled() const { return use_mkldnn_; } + + /** Set and get the number of cpu math library threads. + */ + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); + /** An int state telling how many threads are used in the CPU math library. + */ + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + /** Transform the AnalysisConfig to NativeConfig. + */ + NativeConfig ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; + } + /** Specify the operator type list to use MKLDNN acceleration. + * @param op_list the operator type list. + */ void SetMKLDNNOp(std::unordered_set op_list) { mkldnn_enabled_op_types_ = op_list; } - // Specify the memory buffer of program and parameter + /** Specify the memory buffer of program and parameter + * @param prog_buffer the memory buffer of program. + * @param prog_buffer_size the size of the data. + * @param params_buffer the memory buffer of the composed parameters file. + * @param params_buffer_size the size of the commposed parameters data. + */ void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, - const char* program_buffer, size_t program_buffer_size); + const char* params_buffer, size_t params_buffer_size); + /** A boolean state telling whether the model is set from the CPU memory. + */ bool model_from_memory() const { return model_from_memory_; } friend class ::paddle::AnalysisPredictor; + /** NOTE just for developer, not an official API, easily to be broken. + * Get a pass builder for customize the passes in IR analysis phase. + */ + PassStrategy* pass_builder() const; + protected: + // Update the config. + void Update(); + + std::string SerializeInfoCache(); + + protected: + // Model pathes. + std::string model_dir_; + std::string prog_file_; + std::string params_file_; + + // GPU releated. + bool use_gpu_{false}; + int device_id_{0}; + uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + + // TensorRT releated. bool use_tensorrt_{false}; - bool use_mkldnn_{false}; - std::unordered_set mkldnn_enabled_op_types_; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. int tensorrt_max_batchsize_; - std::unique_ptr pass_builder_; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; + + bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; + bool model_from_memory_{false}; -}; -// Configurations for Anakin engine. -struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; - std::string model_file; - int max_batch_size{-1}; - TargetType target_type; + bool enable_ir_optim_{true}; + bool use_feed_fetch_ops_{true}; + bool ir_debug_{false}; + + bool specify_input_name_{false}; + + int cpu_math_library_num_threads_{1}; + + // A runtime cache, shouldn't be transferred to others. + std::string serialized_info_cache_; + + mutable std::unique_ptr pass_builder_; }; } // namespace contrib diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 1513a4b3b4f6620aa8bf53e082afa203af15ca13..3642f36127f1f8df30858e34bc0e1a8d09603775 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -13,61 +13,76 @@ // limitations under the License. #pragma once +/*! \file paddle_api.h + */ + #include #include #include #include +/*! \namespace paddle + */ namespace paddle { -// Data type. +/** paddle data type. + */ enum PaddleDType { FLOAT32, INT64, // TODO(Superjomn) support more data types if needed. }; -/* - * Memory menage for PaddleTensor. - * The PaddleBuf holds a buffer for data input or output. The memory can be - * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf - * should be reused for better performance. +/** + *\brief Memory menager for PaddleTensor. * - * For user allocated memory, the following API can be used: - * - PaddleBuf(void* data, size_t length) to set an external memory by - * specifying - * the memory address and length. - * - Reset(void* data, size_t length) to reset the PaddleBuf with an external - * memory. - * ATTENTION, for user allocated memory, deallocation should be done by users - * externally after the program finished. The PaddleBuf won't do any allocation - * or deallocation. + *The PaddleBuf holds a buffer for data input or output. The memory can be + *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf + *should be reused for better performance. * - * To have the PaddleBuf allocate and manage the memory: - * - PaddleBuf(size_t length) will allocate a memory of size `length`. - * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION - * if the allocated memory is larger than `length`, nothing will done. + *For user allocated memory, the following API can be used: + *- PaddleBuf(void* data, size_t length) to set an external memory by + *specifying + * the memory address and length. + *- Reset(void* data, size_t length) to reset the PaddleBuf with an external + *memory. + *ATTENTION, for user allocated memory, deallocation should be done by users + *externally after the program finished. The PaddleBuf won't do any allocation + *or deallocation. + * + *To have the PaddleBuf allocate and manage the memory: + *- PaddleBuf(size_t length) will allocate a memory of size `length`. + *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION + * if the allocated memory is larger than `length`, nothing will done. */ class PaddleBuf { public: - // PaddleBuf allocate memory internally, and manage it. + /** PaddleBuf allocate memory internally, and manage it. + */ explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} - // Set external memory, the PaddleBuf won't manage it. + /** Set external memory, the PaddleBuf won't manage it. + */ PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} - // Copy only available when memory is managed externally. + /** Copy only available when memory is managed externally. + */ explicit PaddleBuf(const PaddleBuf&); - // Resize the memory. + /** Resize the memory. + */ void Resize(size_t length); - // Reset to external memory, with address and length set. + /** Reset to external memory, with address and length set. + */ void Reset(void* data, size_t length); - // Tell whether the buffer is empty. + /** Tell whether the buffer is empty. + */ bool empty() const { return length_ == 0; } - // Get the memory address. + /** Get the memory address. + */ void* data() const { return data_; } - // Get the memory length. + /** Get the memory length. + */ size_t length() const { return length_; } ~PaddleBuf() { Free(); } @@ -83,7 +98,8 @@ class PaddleBuf { bool memory_owned_{true}; }; -// Basic input and output data structure for PaddlePredictor. +/** Basic input and output data structure for PaddlePredictor. + */ struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. @@ -94,19 +110,22 @@ struct PaddleTensor { }; enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; -// Tensor without copy, currently only supports AnalysisPredictor. +/** Tensor without copy, currently only supports AnalysisPredictor. + */ class ZeroCopyTensor { public: void Reshape(const std::vector& shape); - // Get the memory in CPU or GPU with specific data type, should Reshape first - // to tell the data size. - // Once can directly call this data to feed the data. - // This is for write the input tensor. + /** Get the memory in CPU or GPU with specific data type, should Reshape first + * to tell the data size. + * Once can directly call this data to feed the data. + * This is for write the input tensor. + */ template T* mutable_data(PaddlePlace place); - // Get the memory directly, will return the place and memory size by pointer. - // This is for reading the output tensor. + /** Get the memory directly, will return the place and memory size by pointer. + * This is for reading the output tensor. + */ template T* data(PaddlePlace* place, int* size) const; @@ -128,8 +147,7 @@ class ZeroCopyTensor { void* scope_{nullptr}; }; -/* - * A simple Inference API for Paddle. +/** A simple Inference API for Paddle. */ class PaddlePredictor { public: @@ -138,18 +156,20 @@ class PaddlePredictor { PaddlePredictor(const PaddlePredictor&) = delete; PaddlePredictor& operator=(const PaddlePredictor&) = delete; - // Predict an record. - // The caller should be responsible for allocating and releasing the memory of - // `inputs`. `inputs` should be available until Run returns. Caller should be - // responsible for the output tensor's buffer, either allocated or passed from - // outside. + /** Predict an record. + * The caller should be responsible for allocating and releasing the memory of + * `inputs`. `inputs` should be available until Run returns. Caller should be + * responsible for the output tensor's buffer, either allocated or passed from + * outside. + */ virtual bool Run(const std::vector& inputs, std::vector* output_data, int batch_size = -1) = 0; - // Zero copy input and output optimization. - // Get the input or output tensors, and operate on their memory directly, - // without copy. + /** Zero copy input and output optimization. + * Get the input or output tensors, and operate on their memory directly, + * without copy. + */ virtual std::unique_ptr GetInputTensor( const std::string& name) { return nullptr; @@ -160,16 +180,19 @@ class PaddlePredictor { } virtual bool ZeroCopyRun() { return false; } - // Clone a predictor that share the model weights, the Cloned predictor should - // be thread-safe. + /** Clone a predictor that share the model weights, the Cloned predictor + * should be thread-safe. + */ virtual std::unique_ptr Clone() = 0; - // Destroy the Predictor. + /** Destroy the Predictor. + */ virtual ~PaddlePredictor() = default; - // The common configs for all the predictors. + /** The common configs for all the predictors. + */ struct Config { - std::string model_dir; // path to the model directory. + std::string model_dir; /*!< path to the model directory. */ }; }; @@ -177,17 +200,21 @@ struct NativeConfig : public PaddlePredictor::Config { // GPU related fields. bool use_gpu{false}; int device{0}; - float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. + float fraction_of_gpu_memory{ + -1.f}; /*!< Change to a float in (0,1] if needed. */ // Specify the exact path of program and parameter files. std::string prog_file; std::string param_file; - // Specify the variable's name of each input if input tensors don't follow the - // `feeds` and `fetches` of the phase `save_inference_model`. + /** Specify the variable's name of each input if input tensors don't follow + * the + * `feeds` and `fetches` of the phase `save_inference_model`. + */ bool specify_input_name{false}; - // Set and get the number of cpu math library threads. + /** Set and get the number of cpu math library threads. + */ void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { cpu_math_library_num_threads_ = cpu_math_library_num_threads; } @@ -201,28 +228,33 @@ struct NativeConfig : public PaddlePredictor::Config { int cpu_math_library_num_threads_{1}; }; -// A factory to help create different predictors. -// -// Usage: -// -// NativeConfig config; -// ... // change the configs. -// auto native_predictor = CreatePaddlePredictor(config); -// -// FOR EXTENSION DEVELOPER: -// Different predictors are designated by config type. Similar configs can be -// merged, but there shouldn't be a huge config containing different fields for -// more than one kind of predictors. +/*! \fn std::unique_ptr CreatePaddlePredictor(const ConfigT& + * config); + * + * \brief A factory to help create different predictors. + * + * Usage: + * + * NativeConfig config; + * ... // change the configs. + * auto native_predictor = CreatePaddlePredictor(config); + * + * FOR EXTENSION DEVELOPER: + * Different predictors are designated by config type. Similar configs can be + * merged, but there shouldn't be a huge config containing different fields for + * more than one kind of predictors. + */ template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); -// NOTE The following APIs are too trivial, we will discard it in the following -// versions. +/** NOTE The following APIs are too trivial, we will discard it in the following + * versions. + */ enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. - kAnalysis, // More optimization. - kAnakin // Use Anakin for inference, not mature yet. + kNative = 0, /*!< Use the native Fluid facility. */ + kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */ + kAnalysis, /*!< More optimization. */ + kAnakin /*!< Use Anakin for inference, not mature yet. */ }; template diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 92fb51d647cf4e2c8a4914d8df2e8b7b6318d1d1..1785bd520a17d5f5060d789b2e4e4f1eda26aa6a 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -26,9 +26,8 @@ limitations under the License. */ #include #include -#include "paddle_api.h" // NOLINT -#ifndef WITH_ANAKIN #include "paddle_analysis_config.h" // NOLINT -#else +#include "paddle_api.h" // NOLINT +#ifdef WITH_ANAKIN #include "paddle_anakin_config.h" // NOLINT #endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 40ca0d287ccde113a20abb1036af289a36f54e6c..9337ae55b76fe4726149f69b07a189dc0cf769c1 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -18,30 +18,39 @@ #include #include +/*! \file */ + +/*! \namespace paddle */ namespace paddle { -/* - * This is a pass builder based on string. It is part of inference API. + +/** This is a pass builder based on string. It is part of inference API. */ class PaddlePassBuilder { public: explicit PaddlePassBuilder(const std::vector &passes) : passes_(passes) {} + /** Append a pass to the end of the passes. */ void AppendPass(const std::string &pass_type); + /** Insert a pass to a specific position. + * @param idx the position to insert. + * @param pass_type the pass key. + */ void InsertPass(size_t idx, const std::string &pass_type); - // Delete the `idx`-th pass. + /** Delete the `idx`-th pass. */ void DeletePass(size_t idx); - // Delete all the passes that has type `pass_type`. + /** Delete all the passes that has type `pass_type`. */ void DeletePass(const std::string &pass_type); - // Visualize the computation graph after each pass by generating a DOT - // language file, one can draw them with the Graphviz toolkit. + /** Visualize the computation graph after each pass by generating a DOT + * language file, one can draw them with the Graphviz toolkit. + */ void TurnOnDebug(); - // Human-readible information. + /** Human-readible information. */ std::string DebugString(); const std::vector &AllPasses() const { return passes_; } @@ -50,23 +59,27 @@ class PaddlePassBuilder { std::vector passes_; }; -/* - * Pass strategy to help control the IR passes. +/**Pass strategy to help control the IR passes. */ class PassStrategy : public PaddlePassBuilder { public: explicit PassStrategy(const std::vector &passes) : PaddlePassBuilder(passes) {} - // The MKLDNN control exists in both CPU and GPU mode, because there can be - // still some CPU kernels running in CPU mode. + /** The MKLDNN control exists in both CPU and GPU mode, because there can be + * still some CPU kernels running in CPU mode. + */ virtual void EnableMKLDNN() = 0; + bool use_gpu() const { return use_gpu_; } + virtual ~PassStrategy() = default; + + protected: + bool use_gpu_{false}; }; -/* - * The CPU passes controller, it is used in AnalysisPredictor with CPU mode. +/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode. */ class CpuPassStrategy : public PassStrategy { public: @@ -88,6 +101,7 @@ class CpuPassStrategy : public PassStrategy { "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // }); + use_gpu_ = false; } virtual ~CpuPassStrategy() = default; @@ -111,23 +125,28 @@ class CpuPassStrategy : public PassStrategy { CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {} }; -/* - * The GPU passes strategy, it is used in +/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. */ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); + + use_gpu_ = true; } GpuPassStrategy(const GpuPassStrategy &other) - : PassStrategy(other.AllPasses()) {} + : PassStrategy(other.AllPasses()) { + use_gpu_ = true; + } void EnableMKLDNN() override; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 17f6c6d9f10abf99fd93364d1356e2b3ef1b3934..9afeafd176c70bc03166ec7732ae5e2faf67ea54 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(plugin) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc new file mode 100644 index 0000000000000000000000000000000000000000..9fecad6eb3889f48f2e0012a718ed0d04f34ae66 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/op_teller.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Just tell by the op_types. +struct SimpleOpTypeSetTeller : public Teller { + SimpleOpTypeSetTeller() {} + + bool operator()(const std::string& op_type, + const framework::OpDesc& desc) override { + return teller_set.count(op_type); + } + + private: + std::unordered_set teller_set{ + {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", + "conv2d_transpose", "leaky_relu"}}; +}; + +bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { + for (auto& teller : tellers_) { + if ((*teller)(op_type, desc)) return true; + } + return false; +} + +OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); } + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h new file mode 100644 index 0000000000000000000000000000000000000000..b98f052bf2478098d74f19858ec79823d5ab1e2d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Single Op teller definition. + * One can override this and define a more complex tell logic, considerring more + * issues such as op_desc. + */ +struct Teller { + virtual bool operator()(const std::string& op_type, + const framework::OpDesc& desc) = 0; + + virtual ~Teller() = default; +}; +/* + * A real example: + * + * struct SomeTeller : public Teller { + * bool operator()(const std::string& op_type, + * const framework::OpDesc& desc) override { + * return op_type == "fc" && desc.Inputs().size() == 2; + * } + *}; + */ + +/* + * class OpTeller helps to tell whether a fluid + * operator can be transformed to a TensorRT layer. + */ +class OpTeller { + public: + static OpTeller& Global() { + static std::unique_ptr x(new OpTeller); + return *x; + } + + bool Tell(const std::string& op_type, const framework::OpDesc& desc); + + private: + OpTeller(); + + private: + std::vector> tellers_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 95bbc74a5961eb28a0d8fbd7c680c0740fc68d8a..131712ca88370aa977184fcb00d09f2283db110c 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -41,7 +41,7 @@ endfunction() if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") - inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) + inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL) else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) + ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") @@ -90,6 +90,11 @@ set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) +# seq_pool1 +set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") +download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) @@ -106,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 - "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") + "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv - "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") + "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 12d61d06ce188a2478448373427f2defae5a2524..5ad6e4a8570b309e94375234d673e27698999cb7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -165,12 +165,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 142801382b4fdeaa63f51390b63cf6db6cb8f60d..b9666e01adb23e0cbd9257bc55081c3a5001e887 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -98,20 +98,17 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); PaddleTensor input_tensor; input_tensor.name = "word"; - input_tensor.shape.assign({static_cast(one_batch.data.size()), 1}); - input_tensor.lod.assign({one_batch.lod}); input_tensor.dtype = PaddleDType::INT64; - TensorAssignData(&input_tensor, {one_batch.data}); + TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); input_slots->assign({input_tensor}); } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 8aaab6d6649e1d4b6db7695df0e9dd219c89422c..1318fbcbc4022457354fb34c727cf56ce26e12ec 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> query_data_all, title_data_all; + std::vector> query, title; std::vector lod1, lod2; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,22 +31,9 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= query_data_all.size()) { - data.query_data_all.assign(query_data_all.begin() + batch_iter, - query_data_all.begin() + batch_end); - data.title_data_all.assign(title_data_all.begin() + batch_iter, - title_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - CHECK(!data.query_data_all.empty()); - CHECK(!data.title_data_all.empty()); - CHECK_EQ(data.query_data_all.size(), data.title_data_all.size()); - for (size_t j = 0; j < data.query_data_all.size(); j++) { - // calculate lod - data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size()); - data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size()); - } + if (batch_end <= query.size()) { + GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -67,8 +52,8 @@ struct DataRecord { // load title data std::vector title_data; split_to_int64(data[1], ' ', &title_data); - query_data_all.push_back(std::move(query_data)); - title_data_all.push_back(std::move(title_data)); + query.push_back(std::move(query_data)); + title.push_back(std::move(title_data)); } num_samples = num_lines; } @@ -80,15 +65,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_query_tensor.name = "left"; lod_title_tensor.name = "right"; auto one_batch = data->NextBatch(); - int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size - int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size - lod_query_tensor.shape.assign({size1, 1}); - lod_query_tensor.lod.assign({one_batch.lod1}); - lod_title_tensor.shape.assign({size2, 1}); - lod_title_tensor.lod.assign({one_batch.lod2}); // assign data - TensorAssignData(&lod_query_tensor, one_batch.query_data_all); - TensorAssignData(&lod_title_tensor, one_batch.title_data_all); + TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); // Set inputs. input_slots->assign({lod_query_tensor, lod_title_tensor}); for (auto &tensor : *input_slots) { @@ -97,11 +76,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index f19a2ed59ef2f666393124323ffee2f1e79ccf06..6fef79dc4608acd6eee679ad4939e7684db98f5b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> word_data_all, mention_data_all; + std::vector> word, mention; std::vector lod; // two inputs have the same lod info. - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,20 +31,10 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= word_data_all.size()) { - data.word_data_all.assign(word_data_all.begin() + batch_iter, - word_data_all.begin() + batch_end); - data.mention_data_all.assign(mention_data_all.begin() + batch_iter, - mention_data_all.begin() + batch_end); - // Prepare LoDs - data.lod.push_back(0); - CHECK(!data.word_data_all.empty()); - CHECK(!data.mention_data_all.empty()); - CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); - for (size_t j = 0; j < data.word_data_all.size(); j++) { - // calculate lod - data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); - } + if (batch_end <= word.size()) { + GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end); + GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter, + batch_end); } batch_iter += batch_size; return data; @@ -65,27 +53,22 @@ struct DataRecord { // load mention data std::vector mention_data; split_to_int64(data[3], ' ', &mention_data); - word_data_all.push_back(std::move(word_data)); - mention_data_all.push_back(std::move(mention_data)); + word.push_back(std::move(word_data)); + mention.push_back(std::move(mention_data)); } num_samples = num_lines; } }; -void PrepareInputs(std::vector *input_slots, DataRecord *data, - int batch_size) { +void PrepareInputs(std::vector *input_slots, DataRecord *data) { PaddleTensor lod_word_tensor, lod_mention_tensor; lod_word_tensor.name = "word"; lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); - int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size - lod_word_tensor.shape.assign({size, 1}); - lod_word_tensor.lod.assign({one_batch.lod}); - lod_mention_tensor.shape.assign({size, 1}); - lod_mention_tensor.lod.assign({one_batch.lod}); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all); + TensorAssignData(&lod_word_tensor, one_batch.word, one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention, + one_batch.lod); // Set inputs. input_slots->assign({lod_word_tensor, lod_mention_tensor}); for (auto &tensor : *input_slots) { @@ -101,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], buffer_param.size()); } else { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/param"); } - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { @@ -116,7 +98,7 @@ void SetInput(std::vector> *inputs) { int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; for (int bid = 0; bid < epoch; ++bid) { - PrepareInputs(&input_slots, &data, FLAGS_batch_size); + PrepareInputs(&input_slots, &data); (*inputs).emplace_back(input_slots); } } diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 764ae5ed8506a7ed7dc51a5c36d0dd7e9df925f3..629981d565f1b6eeabc192287cb9f892df21b8e4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -21,12 +21,10 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/params"; - cfg->prog_file = FLAGS_infer_model + "/model"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 17f4587a5093a2f1cd2d8acc0e17f2129ad36353..3c52afbfb8f60b1e9389d416a5640c9685d8e764 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { @@ -225,10 +223,10 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg(false); + contrib::AnalysisConfig cfg; SetConfig(&cfg); - cfg.fraction_of_gpu_memory = 0.1; - cfg.pass_builder()->TurnOnDebug(); + cfg.DisableGpu(); + cfg.SwitchIrDebug(); std::vector outputs; std::vector> input_slots_all; @@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) { TEST(Analyzer_rnn1, ZeroCopy) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); PaddlePlace place; auto predictor = CreatePaddlePredictor(config); - config.use_feed_fetch_ops = true; - auto native_predictor = CreatePaddlePredictor(config); + config.SwitchUseFeedFetchOps(true); + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); - config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + config.SwitchUseFeedFetchOps( + true); // the analysis predictor needs feed/fetch. auto analysis_predictor = CreatePaddlePredictor(config); #define NEW_TENSOR(name__) \ @@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) { TEST(Analyzer_rnn1, ZeroCopyMultiThread) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); #define NEW_TENSOR(name__) \ auto name__##_tensor = predictor->GetInputTensor(#name__); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index f8354e76871e7f489fd21f2f74e7402db01845c3..007f9f0b66a7b276f5f2e8500a3001788ad41e79 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -105,12 +105,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f5082cd60f1ae4e4eaf9dbe59a446ace900ee456..47c1d7375843e4bad212c1d7d621c9e6d45e5982 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -18,12 +18,9 @@ namespace paddle { namespace inference { struct DataRecord { - std::vector> title1_all, title2_all, title3_all, l1_all; std::vector> title1, title2, title3, l1; - std::vector title1_lod, title2_lod, title3_lod, l1_lod; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + std::vector lod1, lod2, lod3, l1_lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,41 +30,11 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= title1_all.size()) { - data.title1_all.assign(title1_all.begin() + batch_iter, - title1_all.begin() + batch_end); - data.title2_all.assign(title2_all.begin() + batch_iter, - title2_all.begin() + batch_end); - data.title3_all.assign(title3_all.begin() + batch_iter, - title3_all.begin() + batch_end); - data.l1_all.assign(l1_all.begin() + batch_iter, - l1_all.begin() + batch_end); - // Prepare LoDs - data.title1_lod.push_back(0); - data.title2_lod.push_back(0); - data.title3_lod.push_back(0); - data.l1_lod.push_back(0); - CHECK(!data.title1_all.empty()); - CHECK(!data.title2_all.empty()); - CHECK(!data.title3_all.empty()); - CHECK(!data.l1_all.empty()); - CHECK_EQ(data.title1_all.size(), data.title2_all.size()); - CHECK_EQ(data.title1_all.size(), data.title3_all.size()); - CHECK_EQ(data.title1_all.size(), data.l1_all.size()); - for (size_t j = 0; j < data.title1_all.size(); j++) { - data.title1.push_back(data.title1_all[j]); - data.title2.push_back(data.title2_all[j]); - data.title3.push_back(data.title3_all[j]); - data.l1.push_back(data.l1_all[j]); - // calculate lod - data.title1_lod.push_back(data.title1_lod.back() + - data.title1_all[j].size()); - data.title2_lod.push_back(data.title2_lod.back() + - data.title2_all[j].size()); - data.title3_lod.push_back(data.title3_lod.back() + - data.title3_all[j].size()); - data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); - } + if (batch_end <= title1.size()) { + GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end); + GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end); + GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -92,10 +59,10 @@ struct DataRecord { // load l1 data std::vector l1_data; split_to_int64(data[3], ' ', &l1_data); - title1_all.push_back(std::move(title1_data)); - title2_all.push_back(std::move(title2_data)); - title3_all.push_back(std::move(title3_data)); - l1_all.push_back(std::move(l1_data)); + title1.push_back(std::move(title1_data)); + title2.push_back(std::move(title2_data)); + title3.push_back(std::move(title3_data)); + l1.push_back(std::move(l1_data)); } num_samples = num_lines; } @@ -109,24 +76,11 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, title3_tensor.name = "title3"; l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); - int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; - title1_tensor.shape.assign({title1_size, 1}); - title1_tensor.lod.assign({one_batch.title1_lod}); - int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; - title2_tensor.shape.assign({title2_size, 1}); - title2_tensor.lod.assign({one_batch.title2_lod}); - int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; - title3_tensor.shape.assign({title3_size, 1}); - title3_tensor.lod.assign({one_batch.title3_lod}); - int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; - l1_tensor.shape.assign({l1_size, 1}); - l1_tensor.lod.assign({one_batch.l1_lod}); - // assign data - TensorAssignData(&title1_tensor, one_batch.title1); - TensorAssignData(&title2_tensor, one_batch.title2); - TensorAssignData(&title3_tensor, one_batch.title3); - TensorAssignData(&l1_tensor, one_batch.l1); + TensorAssignData(&title1_tensor, one_batch.title1, one_batch.lod1); + TensorAssignData(&title2_tensor, one_batch.title2, one_batch.lod2); + TensorAssignData(&title3_tensor, one_batch.title3, one_batch.lod3); + TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); for (auto &tensor : *input_slots) { @@ -135,11 +89,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..a1742f606819334e7b15e644f8b9e330795bf16e --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct OneSlotInBatch { + std::string name; + std::vector> data; + std::vector shape; + std::vector lod; +}; + +struct DataRecord { + std::vector> batched_data; + std::map>> datasets; + size_t batch_iter{0}, num_samples; // total number of samples + + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) { + Load(path); + Prepare(batch_size); + } + + void Load(const std::string &path) { + std::ifstream file(path); + constexpr int num_slots = 154; + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + std::vector slot_data; + split_to_float(data[1], ' ', &slot_data); + std::string name = data[0]; + PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0, + "line %d, %s should be divisible", num_lines, name); + datasets[name].emplace_back(std::move(slot_data)); + } + num_samples = num_lines / num_slots; + PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast(num_lines), + "num samples should be divisible"); + PADDLE_ENFORCE_GT(num_samples, 0); + } + + void Prepare(int bs) { + for (auto it = datasets.begin(); it != datasets.end(); ++it) { + PADDLE_ENFORCE_EQ(it->second.size(), num_samples, + "size of each slot should be equal"); + } + size_t num_batches = num_samples / bs; + EXPECT_GT(num_batches, 0); + batched_data.resize(num_batches); + for (auto &one_batch : batched_data) { + one_batch.resize(datasets.size()); + size_t i = 0; + for (auto it = datasets.begin(); it != datasets.end(); ++it) { + auto &slot = one_batch[i]; + slot.name = it->first; + slot.data.resize(bs); + slot.lod.resize(bs + 1); + slot.lod[0] = 0; + auto &lod = slot.lod; + auto &datas = it->second; + for (int k = 0; k < bs; ++k) { + size_t id = k + batch_iter * bs; + std::copy(datas[id].begin(), datas[id].end(), + std::back_inserter(slot.data[k])); + size_t len = datas[id].size() / 11; + PADDLE_ENFORCE_EQ(len * 11, datas[id].size(), + "%s %d size should be divisible", slot.name, id); + lod[k + 1] = lod[k] + len; + } + slot.shape.assign({static_cast(lod[bs]), 11}); + i++; + } + } + } + + const std::vector &NextBatch() { + if (batch_iter >= batched_data.size() - 1) { + batch_iter = -1; + } + return batched_data[++batch_iter]; + } +}; + +static void TensorAssignSlot(PaddleTensor *tensor, const OneSlotInBatch &slot) { + tensor->name = slot.name + "_embed"; + tensor->shape = slot.shape; + tensor->dtype = PaddleDType::FLOAT32; + tensor->lod.clear(); + tensor->lod.emplace_back(slot.lod); + TensorAssignData(tensor, slot.data); +} + +void PrepareInputs(std::vector *input_slots, DataRecord *data) { + const auto &one_batch = data->NextBatch(); + input_slots->resize(one_batch.size()); + for (size_t i = 0; i < one_batch.size(); ++i) { + auto &slot = one_batch[i]; + TensorAssignSlot(&((*input_slots)[i]), slot); + } +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->pass_builder()->TurnOnDebug(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.batched_data.size() : 1; + LOG(INFO) << "number of samples: " + << data.batched_data.size() * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data); + (*inputs).emplace_back(input_slots); + } +} + +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_seq_pool1, profile) { profile(); } + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_seq_pool1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; + EXPECT_EQ(num_ops, 349); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 79f3c81ade450fa00419b652042b2cfc79b08e4c..7b448a3200351f902df277f7a653cf7114becba0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -47,11 +47,10 @@ struct DataReader { }; void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index d73bccefd5fc8a8ad8679b7de3feac50f786daed..5a77b53a8513cdbef5620d36ba5e0722ae993916 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) { } void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/__params__"; - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + cfg->DisableGpu(); + cfg->SwitchIrDebug(); + cfg->SwitchSpecifyInputNames(); // TODO(TJ): fix fusion gru cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index 7046bce303e2bd46197ab512ae273500b9af88bf..cf0f1d5c18c79e34c96d4301dbf13c924ae2a3f0 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -64,19 +64,23 @@ std::ostream &operator<<(std::ostream &os, num_spaces++; os << *reinterpret_cast(&config); if (!config.model_from_memory()) { - os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; - os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.params_file() + << "\n"; } else { os << GenSpaces(num_spaces) << "prog_file and param_file: load from memory \n"; } - os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() + << "\n"; + os << GenSpaces(num_spaces) + << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; os << GenSpaces(num_spaces) - << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; - os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() + << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; + os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() << "\n"; - os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b0c8f395ce05fbfceaec3d8b69367292eca714e4..41d033df85811a4730cab8b3275aaffd1ba338e5 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -132,7 +132,8 @@ std::unordered_map GetFuseStatis(PaddlePredictor *predictor, void SetFakeImageInput(std::vector> *inputs, const std::string &dirname, bool is_combined = true, std::string model_filename = "model", - std::string params_filename = "params") { + std::string params_filename = "params", + const std::vector *feed_names = nullptr) { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); std::vector> feed_target_shapes = GetFeedTargetShapes( @@ -146,29 +147,47 @@ void SetFakeImageInput(std::vector> *inputs, os << "}\n"; } LOG(INFO) << os.str(); - - int dim1 = feed_target_shapes[0][1]; - int dim2 = feed_target_shapes[0][2]; - int dim3 = feed_target_shapes[0][3]; - - PaddleTensor input; - std::vector shape({FLAGS_batch_size, dim1, dim2, dim3}); - input.shape = shape; - input.dtype = PaddleDType::FLOAT32; - - // fill input data, for profile easily, do not use random data here. - size_t size = FLAGS_batch_size * dim1 * dim2 * dim3; - input.data.Resize(size * sizeof(float)); - float *input_data = static_cast(input.data.data()); - for (size_t i = 0; i < size; i++) { - *(input_data + i) = static_cast(i) / size; + if (feed_names) { + PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size()); + } + std::vector input_slots(feed_target_shapes.size()); + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + const auto &feed_shape = feed_target_shapes[i]; + auto &input = input_slots[i]; + std::vector shape({FLAGS_batch_size}); + for (size_t s = 1; s < feed_shape.size(); ++s) { + shape.push_back(static_cast(feed_shape[s])); + } + if (feed_names) { + input.name = (*feed_names)[i]; + } + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + size_t len = std::accumulate(shape.begin(), shape.end(), 1, + [](int a, int b) { return a * b; }); + input.data.Resize(len * sizeof(float)); + input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); + float *input_data = static_cast(input.data.data()); + // fill input data, for profile easily, do not use random data here. + for (size_t j = 0; j < len; ++j) { + *(input_data + j) = static_cast(j) / len; + } } - - std::vector input_slots; - input_slots.assign({input}); (*inputs).emplace_back(input_slots); } +void GetInputPerBatch(const std::vector> &in, + std::vector> *out, + std::vector *lod, size_t batch_iter, + size_t batch_end) { + lod->clear(); + lod->push_back(0); + for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) { + out->push_back(*it); + lod->push_back(lod->back() + (*it).size()); // calculate lod + } +} + void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, @@ -309,7 +328,10 @@ void CompareNativeAndAnalysis( const std::vector> &inputs) { PrintConfig(config, true); std::vector native_outputs, analysis_outputs; - TestOneThreadPrediction(config, inputs, &native_outputs, false); + const auto *analysis_config = + reinterpret_cast(config); + auto native_config = analysis_config->ToNativeConfig(); + TestOneThreadPrediction(&native_config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); CompareResult(analysis_outputs, native_outputs); } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index d3bd035c1c49c926fc9f5ed83085b2e6d9ca8c93..21df6eab814dad5e2f654bf6d9558a2f9859d5ae 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -46,22 +46,20 @@ void SetConfig(contrib::AnalysisConfig* config, std::string model_dir, bool use_gpu, bool use_tensorrt, int batch_size) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { - config->prog_file = model_dir + "/" + FLAGS_prog_filename; - config->param_file = model_dir + "/" + FLAGS_param_filename; + config->SetModel(model_dir + "/" + FLAGS_prog_filename, + model_dir + "/" + FLAGS_param_filename); } else { - config->model_dir = model_dir; + config->SetModel(model_dir); } if (use_gpu) { - config->use_gpu = true; - config->device = 0; - config->fraction_of_gpu_memory = 0.15; + config->EnableUseGpu(100, 0); if (use_tensorrt) { config->EnableTensorRtEngine(1 << 10, batch_size); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); } else { - config->enable_ir_optim = true; + config->SwitchIrOptim(); } } } @@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { - contrib::AnalysisConfig config(true); + contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); @@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) { &native_outputs, false); std::vector analysis_outputs; - contrib::AnalysisConfig analysis_config(true); + contrib::AnalysisConfig analysis_config; + analysis_config.EnableUseGpu(50, 0); SetConfig(&analysis_config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestOneThreadPrediction( @@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) { TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - AnalysisConfig config(true); - config.model_dir = model_dir; - config.fraction_of_gpu_memory = 0.15; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); config.pass_builder()->TurnOnDebug(); std::vector> inputs_all; diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index ab3a30ce6bba14a7d5ec700a159d90031e6b5dc7..29f0f034a2aab50330d4d0127b870a5cb00d56a5 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -3,14 +3,16 @@ set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") function (inference_download install_dir url filename) message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") message(STATUS "finish downloading ${filename}") endfunction() function (inference_download_and_uncompress install_dir url filename) inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} + WORKING_DIRECTORY ${install_dir} + ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4a14eb941cd98e333a3e85aff064e6099b3be396..e53a6a562ad1ed2ca02683b07cf6d4b56bc2cde7 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -46,14 +46,14 @@ endif() register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() # conv_fusion_op needs cudnn 7 above - if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 855c4d70677395992e2bf685c910cbea2d37b20b..49e734ce96b0d38b59102575250a020e6924362a 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -64,7 +64,7 @@ class ClipByNormKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", - in_var->Type().name()); + framework::ToTypeName(in_var->Type())); } PADDLE_ENFORCE_NOT_NULL(input); diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 6446cab5ec5f889dccaef90484476e55c4852dee..2e7f3edd55c3353bacddec3dd4ffaba9e0208136 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase { OpComment comment; PADDLE_ENFORCE(context->HasInput("X"), "Input(X) of %s operator must not be null", comment.type); - auto dim_x = context->GetInputDim("X"); - context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 48800947fd387bf4d84a85e82fdcd7efa3f08de5..0360cf5273591946570cac47e2578e43f498b550 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -175,14 +175,13 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (framework::IsType(og_outside.Type())) { + if (og_outside.IsType()) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (framework::IsType( - og_outside.Type())) { + } else if (og_outside.IsType()) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); @@ -256,7 +255,7 @@ class WhileGradOp : public framework::OperatorBase { var->IsType(), "Currently the type of var only can be LoDTensorArray, " "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, var->Type().name()); + inside_grad_name, framework::ToTypeName(var->Type())); if (var->IsType()) { auto &inside_tensor = var->Get(); diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 92d394eb3c5aeb84605179cb2b5106f56a13f88e..f172431e483f38665251617e6fcfddb4bcc0d9d4 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -19,6 +19,10 @@ limitations under the License. */ #include #include "paddle/fluid/platform/cudnn_helper.h" +DECLARE_uint64(conv_workspace_size_limit); +DECLARE_bool(cudnn_exhaustive_search); +DECLARE_int64(cudnn_exhaustive_search_times); + namespace paddle { namespace operators { @@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; template class AlgorithmsCache { public: + AlgorithmsCache() : search_times_(0) { hash_.clear(); } // Caches the best algorithm for a given // combination of tensor dimensions & compute data type. TAlgorithm GetAlgorithm( @@ -54,9 +59,14 @@ class AlgorithmsCache { int algorithmFlags, // can set for different data type std::function gen_func); + TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, + std::function gen_func); + private: std::unordered_map hash_; std::mutex mutex_; + + int search_times_; }; template @@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache::GetAlgorithm( return hash_[seed]; } +template +TAlgorithm AlgorithmsCache::GetAlgorithm( + int64_t area, int search_times, int algorithmFlags, + std::function gen_func) { + if (hash_.find(area) != hash_.end()) { + return hash_[area]; + } + if (search_times_ < search_times) { + auto algo = gen_func(); + hash_[area] = algo; + ++search_times_; + return algo; + } + TAlgorithm algo; + int64_t min = static_cast(INT_MAX); + for (const auto& m : hash_) { + if (m.first < min) { + min = m.first; + algo = m.second; + } + } + return algo; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_fusion_op.cc b/paddle/fluid/operators/conv_fusion_op.cc index 9bdedb10e0b1bc2d45c084bbc070875117675b75..23b8087e781da30ed7b66ba651f8071ecb7aaf50 100644 --- a/paddle/fluid/operators/conv_fusion_op.cc +++ b/paddle/fluid/operators/conv_fusion_op.cc @@ -28,6 +28,8 @@ namespace operators { // x is Input, // z is ResidualData, // bias is Bias +// When `split_channels` is set, y will be splitted into multiple outputs, +// each output has split_channels[i] number of channels. class Conv2DFusionOpMaker : public Conv2DOpMaker { protected: void Apply() override { @@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " "'relux' , 'tanh', 'band_pass'") .SetDefault("relu"); + AddAttr>( + "split_channels", + "When `split_channels` are set, there will be multiple outputs, the " + "output size is equal to the number of `split_channels`.") + .SetDefault({}); + AddOutput("Outputs", + "This Outputs is used when setting `split_channels`." + "Usually used to fuse conv with same input and same filter size, " + "padding, stride, dilation size.") + .AsDuplicable() + .AsDispensable(); + AddInput("AlgoCache", + "The cache of convolution algorithm, a RAW type variable.") + .AsDispensable(); + AddAttr( + "search_times", + "The number of exhaustive search times for convolution algorithm.") + .SetDefault(-1); } }; + +class Conv2DFusionOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvOp should not be null."); + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + + std::vector oshape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], strides[i])); + } + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvOp should not be null."); + ctx->SetOutputDim("Output", framework::make_ddim(oshape)); + std::vector channels = + ctx->Attrs().Get>("split_channels"); + if (channels.size()) { + PADDLE_ENFORCE(ctx->HasOutputs("Outputs"), + "Output(Outputs) of ConvOp should not be null."); + std::vector oshapes; + oshapes.reserve(channels.size()); + for (size_t i = 0; i < channels.size(); ++i) { + oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]}); + } + ctx->SetOutputsDim("Outputs", oshapes); + } + } +}; + // TODO(qingqing): add gradient operator for conv2d_fusion } // namespace operators @@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker, - ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker); + ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 3235ad52b999e1ca3f992034781edaab9921a300..d8b997cca613f660046106512fc03bf55f9b992d 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -16,13 +16,14 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/cudnn_helper.h" -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); +DEFINE_int64(cudnn_exhaustive_search_times, -1, + "Exhaustive search times for cuDNN convolution, " + "defalut is 1, only search once."); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -117,41 +118,60 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else { + auto search_func = [&]() { + int returned_algo_count; + std::array + fwd_perf_stat; + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); + VLOG(3) << "Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = fwd_perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " " + << stat.memory; + } + return fwd_perf_stat[0].algo; + }; AlgorithmsCache* algo_cache = nullptr; - if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { + int search_times = ctx.Attr("search_times"); + search_times = std::max( + static_cast(FLAGS_cudnn_exhaustive_search_times), search_times); + if (search_times > 0) { + // The searched algo will be cached by `search_times` times for + // different input dimension. For other dimensions, select the algo + // of closest area. + auto var_name = ctx.Inputs("AlgoCache")[0]; algo_cache = ctx.scope() - .FindVar(kCUDNNFwdAlgoCache) + .FindVar(var_name) ->GetMutable>(); + algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, + search_func); } else { - algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNFwdAlgoCache) - ->GetMutable>(); + // Cache searched algo in Var(kCUDNNFwdAlgoCache). + // all conv ops use the same kCUDNNFwdAlgoCache variable. + if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { + algo_cache = + ctx.scope() + .FindVar(kCUDNNFwdAlgoCache) + ->GetMutable>(); + } else { + // TODO(qingqing) remove const_cast + algo_cache = + const_cast(ctx.scope().parent()) + ->Var(kCUDNNFwdAlgoCache) + ->GetMutable>(); + } + algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings, + dilations, 0, search_func); } - algo = algo_cache->GetAlgorithm( - x_dims, f_dims, strides, paddings, dilations, 0, [&]() { - int returned_algo_count; - std::array - fwd_perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, - output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - fwd_perf_stat.data(), cudnn_workspace, - workspace_size_limit)); - }; - workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); - VLOG(3) << "Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = fwd_perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return fwd_perf_stat[0].algo; - }); VLOG(3) << "choose algo " << algo; } @@ -161,9 +181,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if ((activation == "identity") && - (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && - (!residual)) { + if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. // But test in some case, the speed is slower, change to use @@ -197,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } + std::vector channels = ctx.Attr>("split_channels"); + if (channels.size()) { + auto outs = ctx.MultiOutput("Outputs"); + if (x_dims[0] == 1) { + // share data with Output + framework::Tensor t; + t.ShareDataWith(*output); + auto y_dims = output->dims(); + t.Resize({y_dims[1], y_dims[2], y_dims[3]}); + int s = 0; + for (size_t i = 0; i < channels.size(); ++i) { + int e = s + channels[i]; + outs[i]->ShareDataWith(t.Slice(s, e)); + outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]}); + s = e; + } + } else { + // TODO(qingiqng): do copy when batch size large than 1 + PADDLE_THROW("Batch size greater than 1 is Unsupported"); + } + } } }; #endif @@ -204,7 +243,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 8c116c4abfe42296b616dc536821e9be55a8be84..03d9d466c3238c6c853bca75f5b9791a0841ff78 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/conv_op.h" @@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, } } -template +template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + bool is_INT8 = + std::is_same::value || std::is_same::value; + if (!is_INT8) { + ComputeFP32(ctx); + } else { + ComputeINT8(ctx); + } + } + void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); auto& dev_ctx = @@ -274,6 +284,271 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } + void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { + const bool is_test = ctx.Attr("is_test"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); + PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + bool fuse_relu = ctx.Attr("fuse_relu"); + + bool force_fp32_output = ctx.Attr("force_fp32_output"); + + bool is_conv3d = strides.size() == 3U; + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + is_conv3d + ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && + dilations[2] == 1 + : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); + + const T* input_data = input->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + GetWeightsTz(weights_tz, g, is_conv3d); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType) + : paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + + if (force_fp32_output) { + dst_dt = paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + } + + // Get unique name for storing MKLDNN primitives + std::string key; + key.reserve(MaxKeyLength); + platform::ConvMKLDNNHandler::AppendKey( + &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, + input->format(), dst_dt, ctx.op().Output("Output")); + const std::string key_conv_pd = key + "@conv_pd"; + + std::shared_ptr conv_p = nullptr; + std::shared_ptr src_memory_p = nullptr; + std::shared_ptr user_src_memory_p = nullptr; + std::shared_ptr dst_memory_p = nullptr; + std::vector pipeline; + std::shared_ptr conv_pd = + nullptr; + std::shared_ptr handler = nullptr; + + auto prim_key = key + "@conv_p"; + auto dst_key = key + "@dst_mem_p"; + auto src_key = key + "@src_mem_p"; + auto user_src_key = key + "@user_src_mem_p"; + auto src_reorder_key = key + "@src_mem_preorder_p"; + conv_p = std::static_pointer_cast( + dev_ctx.GetBlob(prim_key)); + if (conv_p == nullptr || !is_test) { + const K* filter_data = filter->data(); + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + + bool is_multi_channel = scale_weights_data.size() > 1; + + int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] + : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + output_shift_scale[i] = + scale_out_data; // weights data will contain 0 + // in some models, then weights + // scale couldn't be calculated + else + output_shift_scale[i] = + scale_out_data / (scale_in_data * scale_weights_data[i]); + } + + auto user_src_md = + platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + ((g) == 1) ? mkldnn::memory::format::oihw + : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + std::vector bias_tz; + + auto src_md = + platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::s8, chosen_memory_format); + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); + // create a conv primitive descriptor and save it for usage in backward + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, + memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, output_shift_scale, is_test); + } else { + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, + paddings, mkldnn_engine, fuse_relu, + output_shift_scale, is_test); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + + // create mkldnn memory from input tensors (data/weights) + user_src_memory_p = + handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler->AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + src_memory_p = + handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + + std::shared_ptr weights_memory_p; + int mask_reorder = + is_multi_channel ? ((g != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test, true, scale_weights_data, + mask_reorder); + + if (!force_fp32_output) { + if (fuse_relu) { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + + // create convolution op primitive + auto scale_bias_key = key + "@scale_bias"; + if (bias) { + const float* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = handler->AcquireBiasMemory( + user_bias_md, to_void_cast(bias_data)); + std::shared_ptr bias_memory_p; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( + user_bias_memory_p, pipeline, is_test, true, scale_bias_data, + mask_reorder); + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + } else { + auto src_memory_reorder_p = std::static_pointer_cast( + dev_ctx.GetBlob(src_reorder_key)); + src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(src_key)); + if (src_memory_reorder_p) { + user_src_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(user_src_key)); + user_src_memory_p->set_data_handle(to_void_cast(input_data)); + } else if (src_memory_p) { + src_memory_p->set_data_handle(to_void_cast(input_data)); + } + + dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); + conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); + if (conv_pd) { + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + } + if (!force_fp32_output) { + if (fuse_relu) { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } + if (src_memory_reorder_p) { + pipeline.push_back(*src_memory_reorder_p); + } + pipeline.push_back(*conv_p); + } + // push primitive to stream and wait until it's executed + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); + } private: mkldnn::primitive_attr CreatePostOps(bool fuse_relu, @@ -301,6 +576,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return conv_attr; } + mkldnn::primitive_attr CreatePostOps( + bool fuse_relu, const std::vector output_shift_scale) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 1.0f; // beta + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, @@ -325,6 +617,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { p_conv_pd); } + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, + padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& bias, const memory::desc& dst, @@ -349,6 +668,34 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return std::unique_ptr( p_conv_pd); } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& bias, const memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, bias, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } }; template @@ -555,7 +902,17 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, @@ -565,7 +922,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8e0d2824953a372b96d5819be658636f9a3d78ba..c8b33b8932ddd3bb9706d5b555ca68df4560a31e 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( #endif auto input_data_type = ctx.Input("Input")->type(); - auto filter_data_type = ctx.Input("Filter")->type(); - PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, - "input and filter data type should be consistent"); - + if (input_data_type != framework::proto::VarType::INT8 && + input_data_type != framework::proto::VarType::UINT8) { + auto filter_data_type = ctx.Input("Filter")->type(); + PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, + "input and filter data type should be consistent"); + } if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); @@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() { "whenever convolution output is as an input to residual " "connection.") .SetDefault(false); + AddAttr("Scale_in", + "Scale_in to be used for int8 input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_out", + "Scale_out to be used for int8 output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_in_eltwise", + "Scale_in_eltwise to be used for int8 eltwise input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); + AddAttr("force_fp32_output", + "(bool, default false) Only used in mkldnn INT8 kernel") + .SetDefault(false); // TODO(dzhwinter): need to registered layout transform function AddAttr("workspace_size_MB", "Only used in cudnn kernel. workspace size for cudnn, in MB, " diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 2519f5e7acdb7828743c6e114adfe5e530058406..eaa288edc554d2b62eb67ca01ed2459a88772430 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" @@ -30,6 +29,7 @@ namespace operators { using Tensor = framework::Tensor; constexpr int kConvMKLDNNFP32 = 1; constexpr int kConvMKLDNNINT8 = 2; +constexpr int MaxKeyLength = 256; // Base convolution operator definations for other conv // like operators to reuse the implementation. @@ -158,10 +158,7 @@ class GemmConvKernel : public framework::OpKernel { // to call the matrix multiplication interface. Tensor col_matrix; if (is_expand) { - auto tmp_allocation_ptr = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - framework::product(col_shape) * sizeof(T)); - col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); + col = context.AllocateTmpTensor(col_shape, dev_ctx); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -293,10 +290,7 @@ class GemmConvGradKernel : public framework::OpKernel { // to call the matrix multiplication interface. Tensor col_matrix; if (is_expand) { - auto tmp_allocation_ptr = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - framework::product(col_shape) * sizeof(T)); - col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); + col = context.AllocateTmpTensor(col_shape, dev_ctx); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 2d7d33bd4f9b42b644444912570375bad92ba6c2..cfc2cac7beb8a13526cefc94c127ffc2aea533df 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) { } out->mutable_data(out_dims, context.GetPlace()); auto x_stride = framework::stride(x->dims()); - auto out_stride = framework::stride(out->dims()); auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index f2ba75485c58789de848b8833a1a527d45ced83c..1bf41ed948b5bd4fbd49587f072f5debfa81d77c 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { @@ -22,239 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -struct CudnnRNNCache { - CudnnRNNCache() { - x_desc_ = NULL; - y_desc_ = NULL; - dx_desc_ = NULL; - dy_desc_ = NULL; - } - ~CudnnRNNCache() { release(); } - - cudnnRNNDescriptor_t rnn_desc_; - cudnnTensorDescriptor_t *x_desc_; - cudnnTensorDescriptor_t *y_desc_; - cudnnTensorDescriptor_t *dx_desc_; - cudnnTensorDescriptor_t *dy_desc_; - - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; - - cudnnTensorDescriptor_t dhx_desc_; - cudnnTensorDescriptor_t dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_; - cudnnTensorDescriptor_t dcy_desc_; - - cudnnTensorDescriptor_t output_x_desc_; - cudnnTensorDescriptor_t output_y_desc_; - - cudnnDropoutDescriptor_t dropout_desc_; - - size_t weights_size_; - cudnnFilterDescriptor_t w_desc_; - cudnnFilterDescriptor_t dw_desc_; - - size_t workspace_size_; - size_t reserve_size_; - Tensor reserve_data_; - Tensor workspace_data_; - - Tensor dropout_state_; - - size_t max_length_; - - float dropout_prob_; - bool is_bidirec_; - - int batch_size_; - int input_size_; - int hidden_size_; - int num_layers_; - int seed_; - - void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, - size_t max_len, int batch_size, int input_size, int hidden_size, - int num_layers, float dropout_prob, bool is_bidirec, int seed, - int weight_numel) { - max_length_ = max_len; - batch_size_ = batch_size; - input_size_ = input_size; - hidden_size_ = hidden_size; - num_layers_ = num_layers; - dropout_prob_ = dropout_prob; - is_bidirec_ = is_bidirec; - seed_ = seed; - - x_desc_ = new cudnnTensorDescriptor_t[max_length_]; - y_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; - int dim_a[3]; - int stride_a[3]; - - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); - dim_a[0] = batch_size_; - dim_a[1] = input_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - dim_a[0] = batch_size_; - dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - } - - dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); - dim_a[1] = batch_size_; - dim_a[2] = hidden_size_; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - CUDNN_ENFORCE( - platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); - - size_t state_size; - CUDNN_ENFORCE( - platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); - dropout_state_.Resize({static_cast(state_size)})); - auto *dropout_state_data = - dropout_state_.mutable_data(ctx.GetPlace()); - CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( - dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, - seed_)); - - CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - -#if CUDNN_VERSION >= 6000 - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); -#else - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_DATA_FLOAT)); -#endif - - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( - handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); - - PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, - "cudnn lstm weight size should be SAME"); - int dim_w[3]; - dim_w[0] = weights_size_ / sizeof(float); - dim_w[1] = 1; - dim_w[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( - handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); - - reserve_data_.Resize({static_cast(reserve_size_)}); - reserve_data_.mutable_data(ctx.GetPlace()); - - workspace_data_.Resize({static_cast(workspace_size_)}); - workspace_data_.mutable_data(ctx.GetPlace()); - } - - void release() { - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); - } - - delete[] x_desc_; - delete[] y_desc_; - delete[] dx_desc_; - delete[] dy_desc_; - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); - } -}; - template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -315,9 +82,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { auto input_w_numel = w->numel(); auto batch_size = x->dims()[1]; - cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, - hidden_size, num_layers, dropout_prob, is_bidirec, - seed, input_w_numel); + cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, + input_size, hidden_size, num_layers, dropout_prob, + is_bidirec, seed, input_w_numel); } auto run_seq_len = x->dims()[0]; @@ -380,7 +147,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { ->GetMutable(); auto input_dims = input->dims(); - auto weight_dims = weight->dims(); auto init_h_dims = init_h->dims(); auto init_c_dims = init_c->dims(); in_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..7f18b839271a29523cc06c999c28cc0394717397 --- /dev/null +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + framework::Tensor reserve_data_; + framework::Tensor workspace_data_; + + framework::Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len, + int batch_size, int input_size, int hidden_size, int num_layers, + float dropout_prob, bool is_bidirec, int seed, int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = dropout_state_.mutable_data(place); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(place); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(place); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff90784ed089cd620a4f0a908f196bcdda..7c0fda4169b5e1cf663d04b78b6425d73965c292 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index a800d5df0a7cbc668a0217350098bce2bfdcfa70..8660bc219c12fe8676e3be646c4b878a39700201 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -25,7 +25,7 @@ namespace detail { */ template inline T& Ref(T* ptr, ARGS&&... args) { - PADDLE_ENFORCE(ptr != nullptr, args...); + PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); return *ptr; } diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 0b7c470fe72eb4270b8d5b2d227642d85683c16d..94419d1f9a4ba654952e0aedb46ab94ea8d5c0a8 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -27,8 +27,8 @@ struct StridedMemcpyFunctor; template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<0> src_stride, framework::Dim<0> dst_dim, - framework::Dim<0> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); @@ -50,18 +50,18 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<1> src_stride, framework::Dim<1> dst_dim, - framework::Dim<1> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); - memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { #ifdef PADDLE_WITH_CUDA auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); - memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], cuda_ctx.stream()); #else PADDLE_THROW("Paddle is not compiled with GPU"); @@ -73,19 +73,19 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim src_stride, framework::Dim dst_dim, - framework::Dim dst_stride, T* dst) const { - for (int64_t i = 0; i < dst_dim.head; ++i) { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim[0]; ++i) { StridedMemcpyFunctor func; - func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); - src += src_stride.head; - dst += dst_stride.head; + func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst); + src += src_stride[0]; + dst += dst_stride[0]; } } }; template -struct StridedCopyDimVisitor : public boost::static_visitor { +struct StridedCopyDimVisitor { StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_stride, T* dst) @@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor { dst_stride_(dst_stride), dst_(dst) {} - template - void operator()(Dim dst_dim) const { - Dim src_stride = boost::get(src_stride_); - Dim dst_stride = boost::get(dst_stride_); - constexpr int dim = Dim::dimensions; - StridedMemcpyFunctor functor; - functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); + template + void operator()(const framework::Dim& dst_dim) const { + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(), + dst_); } const platform::DeviceContext& dev_ctx_; diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index acd5993154ed03f206f20082231feb5059ef32e1..6337a4837a64cef2ce0e7bae70d8ba5b8994958e 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -148,7 +148,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { // blockx is multiple of 32. int blockx = std::min( static_cast(((feature_width * num_priors + 31) >> 5) << 5), - 512L); + static_cast(512L)); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index fddd6884017c35112ba48f245759f5d846b55f9a..a652d4d95750ff89f0ef63338031e80eed6f92bb 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); - auto gt_classes_dims = ctx->GetInputDim("GtClasses"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto im_info_dims = ctx->GetInputDim("ImInfo"); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 2c46803fd00e4d34ad9a5e2664b2cab1206ef01f..06e48f1262a74dfdfd6d38e71cd02116f3e6eca5 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Variances"), "Input(Variances) shouldn't be null."); - auto scores_dims = ctx->GetInputDim("Scores"); - auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - auto anchors_dims = ctx->GetInputDim("Anchors"); - auto variances_dims = ctx->GetInputDim("Variances"); - ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); } diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index dc6c3d5a668f97ed6e4baa949e5e8be9942c70cf..0b8053e8d03c426e5a1b619e67bc8dae21c5c024 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { auto anchor_dims = ctx->GetInputDim("Anchor"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, "The rank of Input(Anchor) must be 2."); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index eab4297c737bbf2424fff8814b73942cd520d778..8a25d57e613ee91df40f8040cbb8dbbe8034adb2 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -7,56 +7,52 @@ if(WITH_GRPC) else() set(cc_generic_services "true") endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) +configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY) +# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - if(WITH_GRPC) - grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc - PROTO send_recv.proto + set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) + grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc + variable_response.cc + collective_client.cc collective_server.cc + ${GRPC_SRCS} + PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto DEPS lod_tensor selected_rows_functor memory) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(RPC_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) - cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) - - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) - - cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) - - if(WITH_GPU) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor - selected_rows_functor scope math_function SERIAL) - endif() + cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc + DEPS ${RPC_DEPS} scope profiler math_function SERIAL) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) else() set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc - PROTO send_recv.proto + set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc/server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc + variable_response.cc + collective_client.cc collective_server.cc + ${BRPC_SRCS} + PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto DEPS lod_tensor selected_rows memory) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) - - set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor - proto_desc lookup_sparse_table_op snappystream snappy zlib) - - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${brpc_test_depends} SERIAL) + set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc + DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) +endif() - cc_test(brpc_serde_test SRCS brpc_serde_test.cc - DEPS ${brpc_test_depends} SERIAL) - if(WITH_GPU) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) - endif() +cc_test(rpc_server_test SRCS rpc_server_test.cc + DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) +cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) +cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) +if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + selected_rows_functor scope math_function SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc similarity index 99% rename from paddle/fluid/operators/distributed/brpc_client.cc rename to paddle/fluid/operators/distributed/brpc/brpc_client.cc index 62e32977b8cd7e70ddd9f5d879c6844ff346ce80..87bdb83503783b32720eb57bd303ad7eb4bc17a8 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/distributed/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h similarity index 97% rename from paddle/fluid/operators/distributed/brpc_client.h rename to paddle/fluid/operators/distributed/brpc/brpc_client.h index 80cc81bff37916b558aa87104f194eb37a7fc309..2066ade8a5621f2c201b76690421a943db44535e 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -31,10 +31,10 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc similarity index 97% rename from paddle/fluid/operators/distributed/brpc_rdma_pool.cc rename to paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc index e1be5673dfbc51e6a7beb8e3b8b2f162872a1988..d5c614001e0b2ff24812d5326318883de938fbb8 100644 --- a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_BRPC_RDMA -#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" #include "brpc/channel.h" #include "brpc/rdma/rdma_helper.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h similarity index 100% rename from paddle/fluid/operators/distributed/brpc_rdma_pool.h rename to paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc similarity index 96% rename from paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc rename to paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc index e4604db3a381616c7420f816f0b49a015c925bd4..49e048f07a2396824a51db5c6012206bd8848e82 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h similarity index 96% rename from paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h rename to paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h index ffaf44222422882d6c9a3cb8efbd335e8072fd49..a5bdc331eb29c7c0fe00d7f346025426b51e1cb3 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc similarity index 97% rename from paddle/fluid/operators/distributed/brpc_serde_test.cc rename to paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc index 2a2dc72150a3295205724cb04b4a246efe32d5e8..b902d3db487789a417ed0e5ffc032e7e06ba43fb 100644 --- a/paddle/fluid/operators/distributed/brpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc similarity index 98% rename from paddle/fluid/operators/distributed/brpc_server.cc rename to paddle/fluid/operators/distributed/brpc/brpc_server.cc index 78d41aeac50a31d13fb3304909e7aed07f6a852a..cbe0bd09c7b272c35b78818aa9e26feeb5497779 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/request_handler.h" namespace sendrecv { diff --git a/paddle/fluid/operators/distributed/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h similarity index 95% rename from paddle/fluid/operators/distributed/brpc_server.h rename to paddle/fluid/operators/distributed/brpc/brpc_server.h index 85a7ad0dfe843dad483d43631b69a79d75211ce9..78bbe5adc0813d7cf29963c78947d52bcaea9643 100644 --- a/paddle/fluid/operators/distributed/brpc_server.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.h @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "brpc/server.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc similarity index 96% rename from paddle/fluid/operators/distributed/brpc_variable_response.cc rename to paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc index 75306d72334abc98f1527d4557509bf2a1f8a9bb..eb78917ad2d8b49f1b2d1f8dfb2cbca8a9a9610d 100644 --- a/paddle/fluid/operators/distributed/brpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc @@ -13,7 +13,7 @@ // limitations under the License. // -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h similarity index 97% rename from paddle/fluid/operators/distributed/brpc_variable_response.h rename to paddle/fluid/operators/distributed/brpc/brpc_variable_response.h index b0b91a42a01c79bc76aa19c8745ae8f7d3e9a297..6282f08a725367f74dbcf1fa6a2ad49469d64725 100644 --- a/paddle/fluid/operators/distributed/brpc_variable_response.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h index 53b03c531a2b8859e6d7c904e9ab4d1b7a5c8b9b..6a3a450a1fd2e52c341f824f4816ca13784bda85 100644 --- a/paddle/fluid/operators/distributed/collective_client.h +++ b/paddle/fluid/operators/distributed/collective_client.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler.h" DECLARE_int32(rpc_deadline); diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h index a23dc18b4de86421a0995b9951e0ae6f4dc76150..03c688a78e1cbaba4afe4585e619956188a767a1 100644 --- a/paddle/fluid/operators/distributed/collective_server.h +++ b/paddle/fluid/operators/distributed/collective_server.h @@ -23,7 +23,7 @@ limitations under the License. */ #include "gflags/gflags.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/rpc_server.h" diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index 0a9c69e393257068371e88253b82a500f58ed837..46c761000c31e24d859cb400a4162b06a6c80171 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -21,9 +21,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/math/math_function.h" @@ -52,12 +52,12 @@ std::unique_ptr GenerateVars(platform::Place place) { framework::Scope* scope = new framework::Scope(); framework::Variable* var = scope->Var("var1"); auto* slr = var->GetMutable(); - slr->set_height(1000); + slr->set_height(20000); auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({3, 5})); + tensor->Resize(framework::make_ddim({20000, 1024})); tensor->mutable_data(place); paddle::operators::math::set_constant(ctx, tensor, 32.7); @@ -83,6 +83,7 @@ void Gather(const std::vector& vars, } TEST(PREFETCH, GPU) { + setenv("FLAGS_max_body_size", "2147483647", 1); platform::CUDAPlace place; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/distributed/distributed.h similarity index 80% rename from paddle/fluid/operators/detail/macros.h rename to paddle/fluid/operators/distributed/distributed.h index 6f4a15caa5542a45cd8e26a72b055ca8948069d0..3a9f92259875749ab2ddf26c18cd230c58a61c44 100644 --- a/paddle/fluid/operators/detail/macros.h +++ b/paddle/fluid/operators/distributed/distributed.h @@ -18,15 +18,15 @@ #ifdef PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc_server.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" #define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer #define RPCCLIENT_T paddle::operators::distributed::GRPCClient #else // PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/brpc_client.h" -#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" #define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer #define RPCCLIENT_T paddle::operators::distributed::BRPCClient diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/operators/distributed/distributed_pb.h new file mode 100644 index 0000000000000000000000000000000000000000..f1c662be9af67b418e17987e4eb1ff0a2809c3e3 --- /dev/null +++ b/paddle/fluid/operators/distributed/distributed_pb.h @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_DISTRIBUTE + +#ifdef PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#else // PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#endif // PADDLE_WITH_GRPC + +#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc similarity index 96% rename from paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc rename to paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc index d192f54ee0c924b772045d9b6a01701f640e07c7..c2cb0d7f04eb67275030e841740f0cdb291d9f87 100644 --- a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc @@ -17,7 +17,7 @@ limitations under the License. */ // file and did some modifications so that we can send gRPC // requests without too much copying of the tensor data. -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h similarity index 100% rename from paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h rename to paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_client.cc rename to paddle/fluid/operators/distributed/grpc/grpc_client.cc index 8c54159a41e3361322d0fa7ce36534447680207d..7875c16c3cf412ee06fa7c8eb36400b1096f156b 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h similarity index 98% rename from paddle/fluid/operators/distributed/grpc_client.h rename to paddle/fluid/operators/distributed/grpc/grpc_client.h index 01bf46cc313b4707c7af7a9605926a8b298d679d..fa77d21257647b23b8ac9f8161a216d36d7df773 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -39,10 +39,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc similarity index 96% rename from paddle/fluid/operators/distributed/grpc_serde.cc rename to paddle/fluid/operators/distributed/grpc/grpc_serde.cc index a9dea9cfd2eeaa7e7ed8f052d2f51f5893c1e2e3..6df4fd36f95b127a0bbc0725b83c4494b160785f 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc @@ -21,9 +21,9 @@ limitations under the License. */ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h similarity index 93% rename from paddle/fluid/operators/distributed/grpc_serde.h rename to paddle/fluid/operators/distributed/grpc/grpc_serde.h index 16f5293b0eb413dc43a28193cfd224090aeed659..c9a57beb3a6a7a7cc9973ff0e5325a3daa6d98a9 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h @@ -27,8 +27,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/port.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc similarity index 97% rename from paddle/fluid/operators/distributed/grpc_serde_test.cc rename to paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc index 1936c2c623a779c2599aa560247fa5e24f28cd62..749c1bf39a48608876c77a74aa98be51947cf3b3 100644 --- a/paddle/fluid/operators/distributed/grpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc @@ -21,9 +21,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_server.cc rename to paddle/fluid/operators/distributed/grpc/grpc_server.cc index cda102e78d2de2876d54418574b7e07211fc92b4..08f777e279e34da0c0ac89afd3f660fa089599fe 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_server.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" using ::grpc::ServerAsyncResponseWriter; diff --git a/paddle/fluid/operators/distributed/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h similarity index 93% rename from paddle/fluid/operators/distributed/grpc_server.h rename to paddle/fluid/operators/distributed/grpc/grpc_server.h index d2524f5e65db6dedab78f45e17380359b58a3d11..2fd3a7a74073b52770158cf47b1c86cedae78291 100644 --- a/paddle/fluid/operators/distributed/grpc_server.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.h @@ -29,11 +29,10 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/grpc_service.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h similarity index 98% rename from paddle/fluid/operators/distributed/grpc_service.h rename to paddle/fluid/operators/distributed/grpc/grpc_service.h index 537429b5fe989269d437b6dfe558c0a7dcfc2dcc..0b5c5151e637f0d7aeafaefefb01006ffe0f05c8 100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -23,7 +23,7 @@ #include #include #include -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/platform/profiler.h" // NOTE: This method was originally created by tensorflow diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_variable_response.cc rename to paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc index 76ad02b0300a58cd19ff2541ad53d067197f4177..87e83ca53bf13ac4a015d56572ba073e51722c3e 100644 --- a/paddle/fluid/operators/distributed/grpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc @@ -19,7 +19,7 @@ #include #endif -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h similarity index 89% rename from paddle/fluid/operators/distributed/grpc_variable_response.h rename to paddle/fluid/operators/distributed/grpc/grpc_variable_response.h index 89df07c92cd33bcb76c8539b5566d74fa21bba5e..3ca1d89f750313791c833a9f1f58760406e690c2 100644 --- a/paddle/fluid/operators/distributed/grpc_variable_response.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h @@ -22,13 +22,11 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/variable_response.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index cf14538b1c284d297242197088a66cc156b1762c..c63d65348880ebb4085d83059d9fead6456216d7 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" @@ -32,7 +32,7 @@ namespace paddle { namespace operators { namespace distributed { -using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; @@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection( auto& id_tensor = scope->FindVar(id_name)->Get(); auto* out_tensor = scope->FindVar(out_name)->GetMutable(); + + PADDLE_ENFORCE_GT( + out_tensor->numel(), 0, + "When calling this method, the LoDTensor's numel must larger than zero. " + "Please check LoDTensor::Resize has been called first."); + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; @@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection( auto row_numel = dims[1]; - for (size_t i = 0; i < dims[0]; ++i) { + for (int64_t i = 0; i < dims[0]; ++i) { auto id = ids_in_this_section[i]; auto origin_id = id + abs_sections[section_idx]; auto& offsets = id_to_offset[origin_id]; @@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context) { - auto& local_scope = context.scope().NewScope(); + const framework::ExecutionContext& context, + const framework::Scope& scope) { + auto& local_scope = scope.NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name, out_var_names.push_back(out_name + "@" + epmap[i]); } - auto& id_tensor = local_scope.FindVar(id_name)->Get(); + auto& id_tensor = scope.FindVar(id_name)->Get(); std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); - for (size_t i = 0; i < id_tensor.numel(); ++i) { + for (int64_t i = 0; i < id_tensor.numel(); ++i) { ids_vector.push_back(id_data[i]); } } else { @@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, PADDLE_THROW("paddle is not compiled with CUDA!"); #else auto cpu_place = platform::CPUPlace(); - framework::Tensor cpu_tensor; + framework::LoDTensor cpu_tensor; auto* cpu_tensor_data = cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); auto stream = @@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, context, &local_scope, &actual_ctx); - - context.scope().DeleteScope(&local_scope); + scope.DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53b0fbfb51f60fa86351cca34fd1665c7802591b..2f850a0332256d458e79ed9da361c86eb8a2f780 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context); + const framework::ExecutionContext& context, + const framework::Scope& scope); + +template +void prefetch_with_reconstruct(const std::string& id_name, + const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, + framework::LoDTensor* original) { + prefetch(id_name, out_name, table_names, epmap, height_sections, context, + scope); + auto& out = scope.FindVar(out_name)->Get(); + auto& ids = scope.FindVar(id_name)->Get(); + auto* original_value = original->data(); + auto* out_value = out.data(); + size_t original_width = original->numel() / original->dims()[0]; + + bool is_on_cpu_place = true; + if (!platform::is_cpu_place(ids.place())) { + is_on_cpu_place = false; + } + if (is_on_cpu_place) { + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; + std::memcpy(original_row, out_rows, original_width * sizeof(T)); + } + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& actual_ctx = *pool.Get(context.GetPlace()); + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; + auto stream = + static_cast(&actual_ctx)->stream(); + memory::Copy(boost::get(ids.place()), original_row, + platform::CPUPlace(), out_rows, original_width * sizeof(T), + stream); + } +#endif + } +} }; // namespace distributed }; // namespace operators diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index d2b0eb6ca6de1984dc7cfc2a662c88d5e56e1e05..27ca1f4edc04f5fca54b1a6340243634a596939c 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -84,7 +84,9 @@ class ProtoEncodeHelper { ~ProtoEncodeHelper() { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - paddle::platform::throw_on_error(p_ <= limit_); + if (paddle::platform::is_error(p_ <= limit_)) { + paddle::platform::throw_on_error(p_ <= limit_); + } #undef REPLACE_ENFORCE_GLOG } diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 122619d41b25da488742b4a7192b6a18b8bf9283..cc5b9c29a12ec5386041dfeea22fd388d94115e6 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/distributed/rpc_server.h" + #include #include #include #include - -#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index c3dd459fc4e8c4bd304c09d7a3ed4f456c4dc69f..089ea623f18a27d14342d1d69700ef624477eba4 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_server.h" diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 2637619f304d246fa535bbfc7be3474209b63b0f..b39eef04d8d1de77cb951f90a10e69eebb495282 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -1,4 +1,3 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,13 +17,8 @@ package sendrecv; option cc_generic_services = @cc_generic_services@; service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} @@ -33,19 +27,12 @@ service SendRecvService { rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } -// VariableMessage is serialized paddle variable message. -// It can be: -// LoDTensor -// SelectedRows enum VarType { LOD_TENSOR = 0; SELECTED_ROWS = 1; NCCL_ID = 2; } -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. message VariableMessage { enum Type { // Pod Types @@ -62,21 +49,14 @@ message VariableMessage { string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: Type data_type = 3; repeated int64 dims = 4; - // lod details: int64 lod_level = 5; repeated LodData lod = 6; - // selected_rows height, aka. original dim0 int64 slr_height = 7; - // tensor data bytes serialized = 8; - // selected_rows data bytes rows = 9; - // Look up table block execution output variable name. string out_varname = 10; // If 1, the ps server will start profiling, the ps // server stops profiling and generates a profile to /tmp/profile_ps_* diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 25e2f77fb74f273c1cc5263046202b3fbf1084a3..e5c96507e97267c3d0519a27a36cbac0336c7f28 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 6a87178be5daa02444c41a26f6e6c067713dd96f..5457101a5c9f3eb22f76877676f4a8a750a0f914 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/platform/port.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index a4324f67bb99bfdaa19c1a6dba8e907f17635d14..294cae5f44a4701c064c3669af7b4138f68659e6 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -25,7 +25,7 @@ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" DECLARE_string(rpc_server_profile_path); diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3c0b7ff24f9cf0b128d32aa185e5e3cb47d9c4b9..a8bb597cbd59290df1347c164d37104c6ac431e9 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -33,7 +33,7 @@ register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS}) if(WITH_GPU AND NOT WIN32) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common) - op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common) + op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc index a3b5ff8d17602a73555ad95fa8b27e0c2d855f77..a09bff351fc0c7ae3858358701bf309e9d2f592a 100644 --- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/string/printf.h" diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc index 8754856e140ed074782e6fccb8991571a12babab..7275ab201f471b7d1687b0871f784771923fdfda 100644 --- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc index ef574ccdf48dcf6074a777bcb7667b114415674c..80d712a0e02751485c78887782cf3dce76846cc1 100644 --- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 20870ea07ebf673f1632230f9346c36b264eba30..629f364d712694d2bc1d2483711f5247561ed1da 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "gflags/gflags.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" diff --git a/paddle/fluid/operators/distributed_ops/prefetch_op.cc b/paddle/fluid/operators/distributed_ops/prefetch_op.cc index 86425aba8c4a0f5926042dfbd87ad8e6f2c89a2c..52b96d5f8ef7851aa0a99c1d64771f1dc84c66ad 100644 --- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc +++ b/paddle/fluid/operators/distributed_ops/prefetch_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 0399ff41007fbe10da8d53a05671eb0cfb475a5f..48065437e38b2c5457c135cce03075f81110a329 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc index 8ca2877d8adad643089587fcee0917affa537f7d..ae1b10c3b6c7b4b3b1c4eaa3a9b2454e1edb4360 100644 --- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 0bf4bebbc90028246acff940be7436fa5ba08c6c..e2c2147ab5e9a76498a0fd9e1f18b75eed32e91e 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h index acc9b1e6227942781db61a3bc50b2ac95865f79c..6676ecd1c85d70cd5961af2fb1537e77b10e41bc 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.h @@ -116,7 +116,7 @@ class SplitIdsOpKernel : public framework::OpKernel { } else { PADDLE_THROW( "% should be LoDTensor or SelectedRows, but the received type is %s", - ctx.Inputs("Ids")[0], ids_var->Type().name()); + ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type())); } } }; diff --git a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc index a73cb08eca272b044501d48e7b8c5b7dc8553a50..1598e1d0a47efe317e1dcf7d8595fa5b18829553 100644 --- a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc +++ b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index a8b8a67a114b956f2d6b1b072ef343a179114b34..7a7a3989c047ae379cc14e2f783662db99239445 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -83,7 +83,7 @@ class ElementwiseMulKernel : public framework::OpKernel { z = ctx.Output("Out"); } else { PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - x_var->Type().name()); + framework::ToTypeName(x_var->Type())); } z->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 41644d8cc175cc276c7668d113bf00566c7fdc6e..fd2a98cb45f446e80a4be1b50e94ee611cd23e62 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), "Rank of first input must >= rank of second input."); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 75dbf1d8bf5cb692dcf7b88e9f4c486ab3839701..339408249771d99434ba87ab95b41f0884f2950f 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel { auto& expand_times = context.Attr>("expand_times"); auto* out0 = context.Output("Out"); Eigen::DSizes bcast_dims; - auto x_dims = in0->dims(); for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; } diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 1ed8a2ddd1e6577effd4b8761026418d86000f17..38e57a41ed253eab4d0713af8bb14bac19041f6d 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -146,7 +146,6 @@ class FCOpKernel : public framework::OpKernel { auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); auto output = ctx.Output("Out"); - auto in_dims = input->dims(); auto w_dims = w->dims(); auto out_dims = output->dims(); int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 38cb33e79048add25060e2a7b3bd486287748df8..c86430524e182acd66c61e3e01672a32f15a62c3 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/fill_constant_op.h" namespace paddle { namespace operators { -class FillConstantInferShape : public framework::InferShapeBase { +class FillConstantOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillConstantOp should not be null."); - auto &shape = ctx->Attrs().Get>("shape"); + auto& shape = ctx->Attrs().Get>("shape"); ctx->SetOutputDim("Out", framework::make_ddim(shape)); } -}; - -class FillConstantOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto data_type = - static_cast(Attr("dtype")); - auto value = Attr("value"); - auto force_cpu = Attr("force_cpu"); - - framework::Tensor *tensor = nullptr; - auto &out_var = *scope.FindVar(Output("Out")); - - if (out_var.IsType()) { - tensor = out_var.GetMutable(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else { - PADDLE_THROW( - "fill constant op's output only" - "supports SelectedRows and LoDTensor"); - } - - if (force_cpu) { - auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, data_type); - } else { - tensor->mutable_data(dev_place, data_type); - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, tensor, value); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto data_type = static_cast( + boost::get(op_desc.GetAttr("dtype"))); + auto& out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetDataType(data_type); + } }; class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -107,7 +79,13 @@ Fill up a variable with specified constant value. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, - ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::FillConstantOpVarTypeInference); + +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, + ops::FillConstantOpVarTypeInference, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..77027b5a87d4ab4c16a1b850d1d38c929f68dcf3 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h new file mode 100644 index 0000000000000000000000000000000000000000..417c5b4da611c7f0c2a4408d0eb3c69e56208f47 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +template +class FillConstantKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto value = ctx.Attr("value"); + auto force_cpu = ctx.Attr("force_cpu"); + + framework::Tensor *tensor = nullptr; + + framework::Variable *out_var = ctx.OutputVar("Out"); + + if (out_var->IsType()) { + tensor = out_var->GetMutable(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable()->mutable_value(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + + if (force_cpu) { + tensor->mutable_data(platform::CPUPlace(), data_type); + } else { + tensor->mutable_data(ctx.GetPlace(), data_type); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); + math::set_constant(dev_ctx, tensor, value); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index a0397acab1267365b8aeba30a63152b61b5b25bb..42ab8e99662e1ec67b7a4061b274e84103a7d5b1 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -1,6 +1,10 @@ include(operators) -register_operators(EXCLUDES fusion_transpose_flatten_concat_op) +register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op) if (WITH_GPU) op_library(fusion_transpose_flatten_concat_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) + op_library(fusion_conv_inception_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") + endif() endif() diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index f1466f17fec3546bd88bcdb350c26c47a1ee364a..c8282aefe43b14e3889d86b2aecaa569c7076d4f 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -241,15 +241,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { bool is_reverse = ctx.Attr("is_reverse"); \ bool use_peepholes = ctx.Attr("use_peepholes"); -#define INIT_BASE_SIZES \ - auto ids_dims = ids->dims(); /* T x M*/ \ - auto ids_numel = ids->numel(); /* T x 1*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - int64_t row_number = embeddings->dims()[0]; \ - int64_t row_width = embeddings->dims()[1]; \ +#define INIT_BASE_SIZES \ + auto ids_dims = ids->dims(); /* T x M*/ \ + auto ids_numel = framework::product(ids_dims); /* T x 1*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + int64_t row_number = embeddings->dims()[0]; \ + int64_t row_width = embeddings->dims()[1]; \ const int D4 = wh_dims[1]; #define INIT_BASE_INPUT_DATAS \ diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe4c73f4723355d4b56d075423de29b45b9cd4e4 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input W of FusedEmbeddingSeqPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input Ids of FusedEmbeddingSeqPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output of FusedEmbeddingSeqPoolOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + const std::string& combiner = ctx->Attrs().Get("combiner"); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_GE(ids_dims.size(), 1, + "The dim size of the 'Ids' tensor must greater than 1."); + PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + // we only support sum now + PADDLE_ENFORCE_EQ(combiner, "sum"); + + int64_t last_dim = table_dims[1]; + for (int i = 1; i != ids_dims.size(); ++i) { + last_dim *= ids_dims[i]; + } + + if (ctx->IsRuntime()) { + framework::Variable* ids_var = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + const auto& ids_lod = ids_var->Get().lod(); + + // in run time, the LoD of ids must be 1 + PADDLE_ENFORCE(ids_lod.size(), 1u, + "The LoD level of Input(Ids) must be 1"); + PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); + + int64_t batch_size = ids_lod[0].size() - 1; + + // in run time, the shape from Ids -> output + // should be [seq_length, 1] -> [batch_size, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim})); + } else { + // in compile time, the lod level of ids must be 1 + framework::VarDesc* ids_desc = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); + + // in compile time, the shape from Ids -> output + // should be [-1, 1] -> [-1, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("combiner", + "(string, default sum) " + "A string specifying the reduction op. Currently sum " + "are supported, sum computes the weighted sum of the " + "embedding results for each row.") + .SetDefault("sum"); + // NOTE(minqiyang): grad_inplace is an temporal attribute, + // please do NOT set this attribute in python layer. + AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddComment(R"DOC( +FusedEmbeddingSeqPool Operator. + +Computes embeddings for the given ids and weights. + +This operator is used to perform lookups on the parameter W, +then computes the weighted sum of the lookups results for each row +and concatenated into a dense tensor. + +The input Ids should carry the LoD (Level of Details) information. +And the output will change the LoD information with input Ids. + +)DOC"); + } +}; + +class FusedEmbeddingSeqPoolOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { + return "fused_embedding_seq_pool_grad"; + } +}; + +class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FusedEmbeddingSeqPoolOpGradVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp, + ops::FusedEmbeddingSeqPoolOpGradDescMaker, + ops::FusedEmbeddingSeqPoolOpMaker); +REGISTER_OPERATOR(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolOpGrad, + ops::FusedEmbeddingSeqPoolOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool, + ops::FusedEmbeddingSeqPoolKernel, + ops::FusedEmbeddingSeqPoolKernel); +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolGradKernel, + ops::FusedEmbeddingSeqPoolGradKernel); diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..758432fd9e4197302e0bd8f76a1ca7c524026a70 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +struct EmbeddingVSumFunctor { + void operator()(const framework::ExecutionContext &context, + const LoDTensor *table_t, const LoDTensor *ids_t, + LoDTensor *output_t) { + auto *table = table_t->data(); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + int64_t last_dim = output_t->dims()[1]; + const int64_t *ids = ids_t->data(); + auto ids_lod = ids_t->lod()[0]; + int64_t ids_count = ids_t->numel() / ids_lod.back(); + + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i] * ids_count; + for (int64_t j = 0; j != ids_count; ++j) { + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin + j] * row_width, + output + i * last_dim + j * row_width); + } + + for (int64_t r = (ids_lod[i] + 1) * ids_count; + r < ids_lod[i + 1] * ids_count; ++r) { + PADDLE_ENFORCE_LT(ids[r], row_number); + PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); + blas.AXPY(row_width, 1., table + ids[r] * row_width, + output + i * last_dim + (r % ids_count) * row_width); + } + } + } +}; + +template +class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // float tensor + const LoDTensor *table_var = context.Input("W"); + const std::string &combiner_type = context.Attr("combiner"); + + if (combiner_type == "sum") { + EmbeddingVSumFunctor functor; + functor(context, table_var, ids_t, output_t); + } + } +}; + +template +class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter W of a LookupTable " + "must be either LoDTensor or SelectedRows"); + } + + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); + auto lod = ids->lod()[0]; + int64_t row_width = d_output->dims()[1]; + + framework::Vector *new_rows = d_table->mutable_rows(); + new_rows->resize(ids_num); + std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t)); + + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table_dim[1]}); + T *d_table_data = d_table_value->mutable_data(context.GetPlace()); + const T *d_output_data = d_output->data(); + + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t in_offset = lod[i] * row_width; + const T *out_pos = d_output_data + i * row_width; + T *in_pos = d_table_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(row_width, out_pos, in_pos + r * row_width); + } + } + } else { + LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4690bd766d0b8a4b7a249fb5ccad5f278d1830f5 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class ConvInceptionFusionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + // 1 x + auto in_dims = ctx->GetInputDim("Input"); + // 4 filters + auto w_dims = ctx->GetInputsDim("Filter"); + + PADDLE_ENFORCE(in_dims.size(), 4, "Conv intput should be 4-D tensor."); + PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters"); + PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]); + PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]); + + int n = in_dims[0]; + // compute output channel + // 1st channel + int c = w_dims[0][0]; + // add 2nd channel + c += (w_dims[1][0] - w_dims[2][1] * 2); + // add 3rd channel + c += (w_dims[2][0] - w_dims[3][1]); + // add 4-th channel + c += w_dims[3][0]; + + int h = in_dims[2]; + int w = in_dims[3]; + + ctx->SetOutputDim("Output", {n, c, h, w}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("Input")->type(), ctx.device_context()); + } +}; + +class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + void Make() override { + AddInput("Input", "(Tensor) NCHW layout."); + AddInput("Filter", "(vector) 4 aggregated filters").AsDuplicable(); + AddInput("Bias", "(vector) it's lenght is equal to Filter") + .AsDuplicable(); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); + AddOutput("TempOutput", "").AsDuplicable(); + AddAttr( + "pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr( + "exclusive", + "(bool, default True) When true, will exclude the zero-padding in the " + "averaging calculating, otherwise, include the zero-padding. Note, it " + "is only used when pooling_type is avg. The defalut is True.") + .SetDefault(true); + AddAttr( + "activation", + "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " + "'relux' , 'tanh', 'band_pass'") + .SetDefault("relu"); + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need set use_cudnn to true." + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + AddComment(R"DOC( +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp, + ops::ConvInceptionFusionOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e13887866485bd114ebf12f4bdfa8d60fca6d01 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -0,0 +1,272 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +DECLARE_uint64(conv_workspace_size_limit); + +namespace paddle { +namespace operators { + +#if CUDNN_VERSION >= 7100 +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using ScopedActivationDescriptor = platform::ScopedActivationDescriptor; +using DataLayout = platform::DataLayout; + +using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; +using PoolingMode = platform::PoolingMode; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + +template +using CudnnDataType = platform::CudnnDataType; + +template +class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* input = ctx.Input("Input"); + auto filters = ctx.MultiInput("Filter"); + auto bias = ctx.MultiInput("Bias"); + + auto* output = ctx.Output("Output"); + auto temp_outs = ctx.MultiOutput("TempOutput"); + + const std::string pool_type = ctx.Attr("pooling_type"); + const std::string activation = ctx.Attr("activation"); + const bool exclusive = ctx.Attr("exclusive"); + + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + T* temp_data = temp_outs[0]->mutable_data(input->dims(), ctx.GetPlace()); + + DataLayout layout = DataLayout::kNCHW; + std::vector in_dim = framework::vectorize2int(input->dims()); + + // ------------------- cudnn descriptors --------------------- + PoolingMode pooling_mode; + if (pool_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = exclusive ? PoolingMode::kAverageExclusive + : (PoolingMode::kAverageInclusive); + } + std::vector k0x0 = {0, 0}; + std::vector k1x1 = {1, 1}; + std::vector k1x1_2 = {1, 1}; + std::vector k3x3 = {3, 3}; + ScopedPoolingDescriptor pool_desc; + ScopedActivationDescriptor act_desc; + ScopedTensorDescriptor out_pool_desc; + ScopedTensorDescriptor input_desc; + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1); + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + + cudnnDataType_t cudnn_dtype = CudnnDataType::type; + cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4]; + cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4]; + cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4]; + cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4]; + cudnnConvolutionDescriptor_t* conv_desc = + new cudnnConvolutionDescriptor_t[4]; + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i])); + } + + std::vector> filter_dims; + std::vector> bias_dims; + std::vector> in_dims; + std::vector> out_dims; + std::vector> in_strides; + std::vector> out_strides; + std::vector> bias_strides; + + cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW; + int n = in_dim[0]; + int h = in_dim[2]; + int w = in_dim[3]; + int oc = output->dims()[1]; + + cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE) + ? CUDNN_DATA_DOUBLE + : CUDNN_DATA_FLOAT; + + for (int i = 0; i < 4; ++i) { + filter_dims.push_back(framework::vectorize2int(filters[i]->dims())); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); + bias_dims.push_back({1, filter_dims[i][0], 1, 1}); + bias_strides.push_back({filter_dims[i][0], 1, 1, 1}); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(), + bias_strides[i].data())); + in_dims.push_back({n, filter_dims[i][1], h, w}); + out_dims.push_back({n, filter_dims[i][0], h, w}); + in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1}); + out_strides.push_back({oc * h * w, h * w, w, 1}); + + if (i < 2) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor( + conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(), + CUDNN_CROSS_CORRELATION, compute_type)); + } else { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor( + conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(), + CUDNN_CROSS_CORRELATION, compute_type)); + } + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + conv_desc[i], CUDNN_DEFAULT_MATH)); + } + in_dims[2][1] *= 2; + in_strides[2][0] = oc * h * w; + out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous. + in_strides[3][0] = filter_dims[2][0] * h * w; + CUDNN_ENFORCE( + platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2)); + + cudnnConvolutionFwdAlgo_t algo[4]; + auto handle = dev_ctx.cudnn_handle(); + size_t workspace_size_in_bytes = 0; // final workspace to allocate. + + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { + int64_t max_user_size = + std::max(static_cast(FLAGS_conv_workspace_size_limit), + user_workspace_size); + workspace_size_limit = max_user_size * 1024 * 1024; + } + + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data())); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + out_desc[i], cudnn_dtype, 4, out_dims[i].data(), + out_strides[i].data())); + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, + &algo[i])); + size_t tmp_size = 0; + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], + algo[i], &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + cudnnActivationDescriptor_t cudnn_act_desc = + act_desc.descriptor(activation); + + int oc0 = filter_dims[0][0]; + int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2; + int oc3 = filter_dims[3][0]; + int oc2 = oc - oc0 - oc1 - oc3; + + // branch1: pool + 1x1 conv + ScalingParamType alpha = 1.0f, beta = 0.0f; + CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( + handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, + pool_out_desc, temp_data)); + + std::vector in_datas; + in_datas.push_back(static_cast(temp_data)); + in_datas.push_back(static_cast(input_data)); + in_datas.push_back( + static_cast(output_data + (oc0 + oc1) * h * w)); + T* temp2_data = temp_outs[1]->mutable_data( + framework::make_ddim(out_dims[2]), ctx.GetPlace()); + in_datas.push_back(static_cast(temp2_data + oc2 * h * w)); + + std::vector out_datas; + out_datas.push_back(static_cast(output_data)); + out_datas.push_back(static_cast(output_data + oc0 * h * w)); + out_datas.push_back(static_cast(temp2_data)); + out_datas.push_back( + static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); + + for (int i = 0; i < 4; ++i) { + auto func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], + static_cast(filters[i]->data()), conv_desc[i], + algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, + out_desc[i], out_datas[i], bias_desc[i], + static_cast(bias[i]->data()), cudnn_act_desc, + out_desc[i], out_datas[i])); + }; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + workspace_handle.RunFunc(func, workspace_size_in_bytes); + } + + cudnnTensorDescriptor_t x_desc; + cudnnTensorDescriptor_t y_desc; + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data())); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data())); + CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor( + handle, CudnnDataType::kOne(), x_desc, + static_cast(out_datas[2]), CudnnDataType::kZero(), + y_desc, static_cast(output_data + (oc0 + oc1) * h * w))); + + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i])); + } + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc)); + } +}; +#endif + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 7100 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion, + ops::CUDNNConvInceptionFusionOpKernel, + ops::CUDNNConvInceptionFusionOpKernel); +#endif diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index a807117115763486a58052a6240cdedba6af9ac8..6ca6f0bc04aa696852ed7338dcb4b88a49b2fc81 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("PreOut"), "Output(PreOut) should not be null."); + auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); + if (with_prefetch) { + PADDLE_ENFORCE(ctx->HasOutput("W_Out"), + "Output(W_Out) should not be null."); + } const int64_t batch_size = ctx->GetInputDim("X")[0]; std::vector output_shape({batch_size, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); @@ -95,7 +100,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "(LoDTensor, required), The labels of training data. It's a" "tensor with shape [N, 1]."); - AddInput("PTable", + AddInput("PathTable", "(LoDTensor, optional), The Path Table from root to current word" "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); @@ -119,8 +124,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); + AddOutput( + "W_Out", + "(LoDTensor, optinal) using input 'W' as Output to make it mutable" + "When we are using prefetch") + .AsIntermediate(); AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of @@ -189,23 +216,17 @@ class HierarchicalSigmoidGradOpGradVarTypeInference << " is set to SelectedRows"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to SelectedRows"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); - } } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::LOD_TENSOR); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to LoDTensor"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); - } + } + if (hasBias) { + VLOG(30) << "hierarchical_sigmoid_grad op " + << framework::GradVarName("Bias") << " is set to LoDTensor"; + block->Var(bias_grad_var_name) + ->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index d212e6f8437e69e71c010b6af27a33ff5e39e1e1..1a7ca963010112bbcab69f1ceeb9cb8d19ca9b9e 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -34,8 +40,9 @@ using platform::Transform; static std::vector PathToRows(const framework::LoDTensor& path) { std::set rows; + const int64_t* paths = path.data(); for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = path.data()[i]; + int64_t row = paths[i]; if (row < 0) { continue; } @@ -49,13 +56,54 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); auto& label = detail::Ref(ctx.Input("Label")); auto* bias = ctx.Input("Bias"); auto* out = ctx.Output("Out"); auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); + // for remote prefetch + + auto epmap = ctx.Attr>("epmap"); + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + auto height_sections = ctx.Attr>("height_sections"); + auto table_names = ctx.Attr>("table_names"); + std::vector real_rows = PathToRows(*path); + framework::Scope& local_scope = ctx.scope().NewScope(); + auto* ids = local_scope.Var("Ids@Prefetch"); + auto* x_tensor = ids->GetMutable(); + + x_tensor->mutable_data( + framework::make_ddim({static_cast(real_rows.size()), 1}), + ctx.GetPlace()); + // copy. + + std::memcpy(x_tensor->data(), real_rows.data(), + real_rows.size() * sizeof(int64_t)); + + framework::DDim w_dims = ctx.Input("W")->dims(); + w_dims[0] = x_tensor->dims()[0]; + auto* w_tensor = + local_scope.Var("W@Prefetch")->GetMutable(); + w_tensor->Resize(w_dims); + +#ifdef PADDLE_WITH_DISTRIBUTE + // w_Out is set to used by prefetch, never change it in other cases + auto* w_out = ctx.Output("W_Out"); + operators::distributed::prefetch_with_reconstruct( + "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections, + ctx, local_scope, w_out); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } + bool is_custom = false; if (path) { is_custom = true; @@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); - auto* bias = ctx.Input("Bias"); auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); @@ -173,15 +220,14 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { } // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. - + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } if (!is_sparse) { - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } auto* w_grad = ctx.Output(framework::GradVarName("W")); w_grad->mutable_data(ctx.GetPlace()); @@ -200,21 +246,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); zero(dev_ctx, w_grad_value, static_cast(0.0)); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->set_rows(real_rows); - // build ids -> rows index map - bias_grad->SyncIndex(); - bias_grad->set_height(bias->dims()[0]); - auto* bias_grad_value = bias_grad->mutable_value(); - std::vector dims = {static_cast(real_rows.size()), - bias->dims()[1]}; - bias_grad_value->mutable_data(framework::make_ddim(dims), - ctx.GetPlace()); - zero(dev_ctx, bias_grad_value, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } bit_code->MulGradWeight(pre_out_grad, w_grad, in); } bit_code->MulGradError(pre_out_grad, w, in_grad); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 69e7fa4490b892373d85898b13b976a474a6096a..f458ce6c83bfcfb56d558409b0802f27f13a4761 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel { "Input(Logits@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Logits"); - auto lab_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 9efda3dfc9871f197e0a66329772df2caedc4da4..fa21bd01cb052e5393385f0ef6c0203fc7c9e1a3 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -105,14 +105,16 @@ class HuberLossGradKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); x_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + residual.unaryExpr(HuberLossBackward(delta, -1.0)); + x_grad.device(place) = out_grad * x_grad; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenVector::Flatten(*out1); y_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + residual.unaryExpr(HuberLossBackward(delta, 1.0)); + y_grad.device(place) = out_grad * y_grad; } } }; diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 437005825db7e0718b52ac830dd56ac87069ed39..bde2791add4075be6949703dfbea634966d25c1c 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -190,6 +190,26 @@ void BenchGRUKernel() { } } +template +void BenchSeqPoolKernel() { + std::vector pool_types = { + jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; + for (auto type : pool_types) { + for (int w : TestSizes()) { + jit::seq_pool_attr_t attr(w, type); + for (int h : TestSizes()) { + attr.h = h; + std::vector x(h * w), y(w); + RandomVec(h * w, x.data(), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.data(); + BenchAllImpls, PlaceType>(attr, x_data, + y_data, &attr); + } + } + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -228,4 +248,7 @@ int main(int argc, char* argv[]) { BenchGRUKernel(); BenchGRUKernel(); BenchGRUKernel(); + + // seq pool function + BenchSeqPoolKernel(); } diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 8a540108302f77e1ca3bfe1db0013d76a22d5eb4..2b8c758a032fd7edff0d4b7e23bd8e685eb3ab15 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1) USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kNCHW16CMulNC) +USE_JITKERNEL_GEN(kSeqPool) diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc new file mode 100644 index 0000000000000000000000000000000000000000..530d24ee1fb7d9da84102641e1d4d2ab08ab1860 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/seqpool.h" +#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void SeqPoolJitCode::genCode() { + constexpr int block = YMM_FLOAT_BLOCK; + constexpr int max_num_regs = 8; + const int num_block = w_ / block; + const int num_groups = num_block / max_num_regs; + int rest_num_regs = num_block % max_num_regs; + mov(reg32_int_h, dword[param_attr]); + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + mov(reg_tmp, reinterpret_cast(exp_float_consts)); + vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]); + mov(reg_tmp, reinterpret_cast(fp_h_)); + fild(dword[param_attr]); + fstp(dword[reg_tmp]); + vmovss(xmm_t(0), ptr[reg_tmp]); + if (type_ == SeqPoolType::kSqrt) { + vsqrtps(xmm_t(0), xmm_t(0)); + } + vdivps(xmm_t(1), xmm_t(1), xmm_t(0)); + vmovss(ptr[reg_tmp], xmm_t(1)); + } + const int group_len = max_num_regs * block * sizeof(float); + for (int g = 0; g < num_groups; ++g) { + pool_height(g * group_len, block, max_num_regs); + } + if (rest_num_regs > 0) { + pool_height(num_groups * group_len, block, rest_num_regs); + } + // part of rest_w * height + const int rest = w_ % block; + pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs); + ret(); +} + +class SeqPoolCreator : public JitCodeCreator { + public: + bool UseMe(const seq_pool_attr_t& attr) const override { + return platform::MayIUse(platform::avx); + } + size_t CodeSize(const seq_pool_attr_t& attr) const override { + return 96 + + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * + 4 /* load, mul and save */ + + 256) * + 8; + } + std::unique_ptr CreateJitCode( + const seq_pool_attr_t& attr) const override { + PADDLE_ENFORCE_GT(attr.w, 0); + PADDLE_ENFORCE_GT(attr.h, 0); + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator); diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h new file mode 100644 index 0000000000000000000000000000000000000000..fcbbb3c84c562e2ba57110134bf07bb218b41edb --- /dev/null +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -0,0 +1,214 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class SeqPoolJitCode : public JitCode { + public: + explicit SeqPoolJitCode(const seq_pool_attr_t& attr, + size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) { + if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg || + type_ == SeqPoolType::kSqrt)) { + LOG(FATAL) << "Only support sum pool yet "; + } + fp_h_[0] = 1.f; + this->genCode(); + } + + virtual const char* name() const { + std::string base = "SeqPoolJitCode"; + if (type_ == SeqPoolType::kSum) { + base += "_Sum"; + } else if (type_ == SeqPoolType::kAvg) { + base += "_Avg"; + } else if (type_ == SeqPoolType::kSqrt) { + base += "_Sqrt"; + } + base += ("_W" + std::to_string(w_)); + return base.c_str(); + } + void genCode() override; + + protected: + template + void pool_height(int w_offset, int block, int max_num_regs) { + int offset = w_offset; + for (int i = 0; i < max_num_regs; ++i) { + vmovups(JMM(i), ptr[param_src + offset]); + offset += sizeof(float) * block; + } + cmp(reg32_int_h, 1); + Label l_next_h, l_h_done; + jle(l_h_done, T_NEAR); + mov(reg_h_i, 1); + mov(reg_tmp, param_src); + add(reg_tmp, w_ * sizeof(float) + w_offset); + L(l_next_h); + { + mov(reg_ptr_src_i, reg_tmp); + for (int i = 0; i < max_num_regs; ++i) { + vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]); + // sum anyway + vaddps(JMM(i), JMM(i), JMM(i + max_num_regs)); + add(reg_ptr_src_i, sizeof(float) * block); + } + inc(reg_h_i); + add(reg_tmp, w_ * sizeof(float)); + cmp(reg_h_i, reg32_int_h); + jl(l_next_h, T_NEAR); + } + L(l_h_done); + // save right now + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + mov(reg_tmp, reinterpret_cast(fp_h_)); + vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]); + } + offset = w_offset; + for (int i = 0; i < max_num_regs; ++i) { + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + vmulps(JMM(i), JMM(i), JMM(max_num_regs)); + } + vmovups(ptr[param_dst + offset], JMM(i)); + offset += sizeof(float) * block; + } + } + + void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) { + const int rest_used_num_regs = load_rest(rest, w_offset, 0); + const bool has_block4 = rest / 4 > 0; + const bool has_block2 = (rest % 4) / 2 > 0; + const bool has_block1 = (rest % 2) == 1; + cmp(reg32_int_h, 1); + Label l_next_h, l_h_done; + jle(l_h_done, T_NEAR); + mov(reg_h_i, 1); + mov(reg_tmp, param_src); + add(reg_tmp, w_ * sizeof(float) + w_offset); + L(l_next_h); + { + int reg_idx = 0; + mov(reg_ptr_src_i, reg_tmp); + if (has_block4) { + vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + add(reg_ptr_src_i, sizeof(float) * 4); + reg_idx++; + } + if (has_block2) { + vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + add(reg_ptr_src_i, sizeof(float) * 2); + reg_idx++; + } + if (has_block1) { + vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + reg_idx++; + } + PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs, + "All heights should use same regs"); + for (int i = 0; i < reg_idx; ++i) { + vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); + } + inc(reg_h_i); + add(reg_tmp, w_ * sizeof(float)); + cmp(reg_h_i, reg32_int_h); + jl(l_next_h, T_NEAR); + } + L(l_h_done); + // save right now + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + mov(reg_tmp, reinterpret_cast(fp_h_)); + vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]); + for (int i = 0; i < rest_used_num_regs; ++i) { + vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs)); + } + } + save_rest(rest, w_offset); + } + + // return the number of used regs, use start from reg 0 + int load_rest(int rest, int w_offset, const int num_shift_regs, + const int reg_start = 0) { + const bool has_block4 = rest / 4 > 0; + const bool has_block2 = (rest % 4) / 2 > 0; + const bool has_block1 = (rest % 2) == 1; + int reg_idx = reg_start; + if (has_block4) { + vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); + w_offset += sizeof(float) * 4; + reg_idx++; + } + if (has_block2) { + vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); + w_offset += sizeof(float) * 2; + reg_idx++; + } + if (has_block1) { + vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); + reg_idx++; + } + return reg_idx; + } + + // use reg start from 0 + void save_rest(int rest, int w_offset, int reg_start = 0) { + const bool has_block4 = rest / 4 > 0; + const bool has_block2 = (rest % 4) / 2 > 0; + const bool has_block1 = (rest % 2) == 1; + int reg_idx = reg_start; + if (has_block4) { + vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx)); + w_offset += sizeof(float) * 4; + reg_idx++; + } + if (has_block2) { + vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx)); + w_offset += sizeof(float) * 2; + reg_idx++; + } + if (has_block1) { + vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx)); + } + } + + private: + float ALIGN32_BEG fp_h_[1] ALIGN32_END; + int w_; + SeqPoolType type_; + reg64_t param_src{abi_param1}; + reg64_t param_dst{abi_param2}; + reg64_t param_attr{abi_param3}; + reg64_t reg_tmp{rax}; + + reg32_t reg32_int_h{r8d}; + reg32_t reg32_fp_h{r9d}; + + reg64_t reg_h_i{r10}; + reg64_t reg_ptr_src_i{r11}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index d00584baa081c21762774aef4cbbc714d49cd012..7d02590f2e5d82b5105132d7af716f14c661d067 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -26,6 +26,7 @@ namespace jit { const char* to_string(KernelType kt) { switch (kt) { + ONE_CASE(kNone); ONE_CASE(kVMul); ONE_CASE(kVAdd); ONE_CASE(kVAddRelu); @@ -45,12 +46,26 @@ const char* to_string(KernelType kt) { ONE_CASE(kCRFDecoding); ONE_CASE(kLayerNorm); ONE_CASE(kNCHW16CMulNC); + ONE_CASE(kSeqPool); default: PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; } return nullptr; } + +const char* to_string(SeqPoolType tp) { + switch (tp) { + ONE_CASE(kNonePoolType); + ONE_CASE(kSum); + ONE_CASE(kAvg); + ONE_CASE(kSqrt); + default: + PADDLE_THROW("Not support type: %d, or forget to add it.", tp); + return "NOT PoolType"; + } + return nullptr; +} #undef ONE_CASE KernelType to_kerneltype(const std::string& act) { diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 412df86aa1cd94871989aef25adef803f673812b..fbf34fc4b3db49596b6be0360c00e77c12fab9b8 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -119,6 +119,7 @@ typename KernelTuples::func_type Get( } const char* to_string(KernelType kt); +const char* to_string(SeqPoolType kt); KernelType to_kerneltype(const std::string& act); @@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { << "],act_cand[" << to_string(attr.act_cand) << "]"; return os; } +inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { + os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" + << to_string(attr.type) << "]"; + return os; +} } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index b4a2d5d47301a2fd82bf27ddfaaa31ef23e431c2..2a7697a6f253dcc2b8143d9f14a80a1cfd45996d 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -41,8 +41,16 @@ typedef enum { kCRFDecoding, kLayerNorm, kNCHW16CMulNC, + kSeqPool, } KernelType; +typedef enum { + kNonePoolType = 0, + kSum = 1, + kAvg, + kSqrt, +} SeqPoolType; + template struct XYZNTuples { typedef T data_type; @@ -112,6 +120,21 @@ struct GRUTuples { typedef void (*func_type)(gru_t*, const gru_attr_t*); }; +typedef struct seq_pool_attr_s { + int h, w; // h should always be the first one + SeqPoolType type; + seq_pool_attr_s() = default; + explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1) + : h(height), w(width), type(pool_type) {} +} seq_pool_attr_t; + +template +struct SeqPoolTuples { + typedef T data_type; + typedef seq_pool_attr_t attr_type; + typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); +}; + template struct CRFDecodingTuples { typedef T data_type; diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 4e6a19f04fd425b920aeea49b63001941d800a73..61de38688664f83775c0c4e5aa6f7e06c3602ddb 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -42,6 +42,13 @@ size_t JitCodeKey(const gru_attr_t& attr) { (static_cast(attr.act_cand) << act_type_shift); } +template <> +size_t JitCodeKey(const seq_pool_attr_t& attr) { + size_t key = attr.w; + constexpr int pool_type_shift = 3; + return (key << pool_type_shift) + static_cast(attr.type); +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 863cc720d68ce3dcfe045aa11c559a06a50909f3..f5ed2f0572176e42b774259c2b8fe9713d989417 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -9,3 +9,4 @@ USE_JITKERNEL_MORE(kVScal, mkl) USE_JITKERNEL_MORE(kVExp, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVTanh, mkl) +USE_JITKERNEL_MORE(kSeqPool, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index a5b088d4812b8a54e3b4fb1cb83d9e8bc7501994..5a499ac2c02aa70d2824f0d3be618e083ba10334 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -72,6 +72,26 @@ void VExp(const double* x, double* y, int n) { platform::dynload::vdExp(n, x, y); } +template <> +void VCopy(const float* x, float* y, int n) { + platform::dynload::cblas_scopy(n, x, 1, y, 1); +} + +template <> +void VCopy(const double* x, double* y, int n) { + platform::dynload::cblas_dcopy(n, x, 1, y, 1); +} + +template <> +void VAXPY(float a, const float* x, float* y, int n) { + platform::dynload::cblas_saxpy(n, a, x, 1, y, 1); +} + +template <> +void VAXPY(double a, const double* x, double* y, int n) { + platform::dynload::cblas_daxpy(n, a, x, 1, y, 1); +} + // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> bool VMulKernel::UseMe(const int& d) const { @@ -103,6 +123,16 @@ bool VTanhKernel::UseMe(const int& d) const { return d > 7; } +template <> +bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { + return true; +} + +template <> +bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { + return true; +} + #define AWALYS_USE_ME_WITH_DOUBLE(func) \ template <> \ bool func##Kernel::UseMe(const int& d) const { \ @@ -135,5 +165,6 @@ REGISTER_MKL_KERNEL(kVScal, VScal); REGISTER_MKL_KERNEL(kVExp, VExp); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVTanh, VTanh); +REGISTER_MKL_KERNEL(kSeqPool, SeqPool); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index ee1031c028ff72181f504004b7cbeb9f7ee578f1..0a3816db24ccd0820cb259b40044e1f5b66665f7 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/operators/jit/kernel_base.h" @@ -35,6 +36,12 @@ void VScal(const T* a, const T* x, T* y, int n); template void VExp(const T* x, T* y, int n); +template +void VCopy(const T* x, T* y, int n); + +template +void VAXPY(T a, const T* x, T* y, int n); + template void VSigmoid(const T* x, T* y, int n) { const T min = SIGMOID_THRESHOLD_MIN; @@ -60,6 +67,23 @@ void VTanh(const T* x, T* y, int n) { } } +template +void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { + VCopy(x, y, attr->w); + for (int h = 1; h != attr->h; ++h) { + VAXPY(static_cast(1), x + h * attr->w, y, attr->w); + } + if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { + T scalar = static_cast(1); + if (attr->type == SeqPoolType::kAvg) { + scalar = scalar / static_cast(attr->h); + } else { + scalar = scalar / std::sqrt(static_cast(attr->h)); + } + VScal(&scalar, y, y, attr->w); + } +} + #define DECLARE_MKL_KERNEL(name, tuples) \ template \ class name##Kernel : public KernelMore> { \ @@ -81,6 +105,8 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples); DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); DECLARE_MKL_KERNEL(VTanh, XYNTuples); +DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 07497b732050a7299e224531db37eb56e60ef605..0f626bb3bfd2851e3fb6ad8265169f9bb9860851 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -26,3 +26,4 @@ USE_JITKERNEL_REFER(kGRUHtPart2) USE_JITKERNEL_REFER(kCRFDecoding) USE_JITKERNEL_REFER(kLayerNorm) USE_JITKERNEL_REFER(kNCHW16CMulNC) +USE_JITKERNEL_REFER(kSeqPool) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index d196266326b4ee668f647fa51032f6344d26e5c6..85381daa47484a4053326f04e12d583543a423e0 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -47,4 +47,6 @@ REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm); REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC); +REGISTER_REFER_KERNEL(kSeqPool, SeqPool); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 0fd1b89dfdba9f4655f649fa6d32604188c78da3..b4e9c8dd107ee844544165b1719d38754ae976bc 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -332,6 +332,28 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { } } +template +void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { + for (int w = 0; w < attr->w; ++w) { + const T* src = x + w; + T* dst = y + w; + *dst = static_cast(0); + for (int h = 0; h < attr->h; ++h) { + *dst = *dst + *src; + src += attr->w; + } + } + if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { + T scalar = static_cast(1); + if (attr->type == SeqPoolType::kAvg) { + scalar = scalar / static_cast(attr->h); + } else { + scalar = scalar / std::sqrt(static_cast(attr->h)); + } + VScal(&scalar, y, y, attr->w); + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -370,6 +392,8 @@ DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); +DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index a73e2a60aeb0c1594b5072b2bffbd11cccfcdc7d..30291bfef3bc96fe2e687e5be6d782eee89496aa 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -211,6 +211,24 @@ struct TestFuncWithRefer, std::vector, std::vector, } }; +template +struct TestFuncWithRefer, std::vector, + std::vector> { + void operator()(const typename jit::SeqPoolTuples::func_type tgt, + const std::vector& x, const std::vector& yref, + const typename jit::SeqPoolTuples::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size() % yref.size(), 0); + int w = yref.size(); + std::vector y(w); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, &attr); + ExpectEQ(y_data, yref_data, w); + } +}; + template void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { @@ -415,6 +433,31 @@ void TestGRUKernel() { } } +template +void TestSeqPoolKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + std::vector pool_types = { + jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; + for (auto type : pool_types) { + for (int w : TestSizes()) { + jit::seq_pool_attr_t attr(w, type); + for (int h : TestSizes()) { + attr.h = h; + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(h * w), yref(w); + RandomVec(h * w, x.data(), -2.f, 2.f); + const T* x_data = x.data(); + T* yref_data = yref.data(); + ref(x_data, yref_data, &attr); + VLOG(10) << attr; + TestAllImpls, PlaceType, std::vector, + std::vector>(attr, x, yref, attr); + } + } + } +} + template void TestNCHW16CMulNCKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); @@ -569,6 +612,12 @@ TEST(JITKernel, kGRUHtPart2) { TestGRUKernel(); } +TEST(JITKernel, kSeqPool) { + namespace jit = paddle::operators::jit; + TestSeqPoolKernel(); + TestSeqPoolKernel(); +} + TEST(JITKernel, kNCHW16CMulNC) { namespace jit = paddle::operators::jit; TestNCHW16CMulNCKernelHasOutput(framework::GradVarName("Emission"))) { ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); + ctx->ShareLoD("Emission", framework::GradVarName("Emission")); } if (ctx->HasOutput(framework::GradVarName("Transition"))) { ctx->SetOutputDim(framework::GradVarName("Transition"), transition_exps_dims); + ctx->ShareLoD("Transition", framework::GradVarName("Transition")); } } diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index e28d199eebc09662da46ba4c2b19409b39bd94d3..c4a2282e16483dbe78a32a4148c5bc4349dde3dc 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -38,13 +38,13 @@ class LoadCombineOp : public framework::OperatorBase { static_cast(out_var_names.size()), 0, "The number of output variables should be greater than 0."); if (!model_from_memory) { - std::ifstream fin(filename); + std::ifstream fin(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load_combine op", filename); LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } else { PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); - std::stringstream fin(filename); + std::stringstream fin(filename, std::ios::in | std::ios::binary); LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } } diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 06773d1d0ed6707418a02f7eef2865f3c11de224..4bce4eba22e4a8900f8d12454fd233e17c9ad617 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -34,7 +34,7 @@ class LoadOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. auto filename = Attr("file_path"); - std::ifstream fin(filename); + std::ifstream fin(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", filename); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 9d248e03218b83a65b9786cb317aafbe3dbb67ee..ef1fb83aa6e34c14637b6e761fd7d2dbadee36b8 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel { "Output(Predicted@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 6a0d6bad512fe7cc15e60ed25028bc3cbbbca2ab..fd15539f7b6727496988c9b13d0d2551659a420a 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -92,7 +92,8 @@ class LookupTableCUDAKernel : public framework::OpKernel { // server #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context); + height_sections, context, + context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 3a73a7637c6d7d3eff7443802a4a52be9149e0ef..a7d0fd4856edc74237151c64f286d468ad86e7ca 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel { // server #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context); + height_sections, context, + context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 0a18882e8199c2a375a230a693b8b01d12aabfa0..4e4f977fcc742856b877ef0b7f9a3cc9879aefce 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,8 +50,8 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); @@ -132,8 +132,8 @@ template class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); PADDLE_ENFORCE( diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ea6aebd291eee580a307aa112117434fa942005e..600ab14d37aad6b95516c5bd6551d12165596f57 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -51,7 +51,7 @@ math_library(pooling) math_library(selected_rows_functor DEPS selected_rows math_function blas) math_library(sequence2batch) math_library(sequence_padding) -math_library(sequence_pooling DEPS math_function) +math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) math_library(softmax DEPS math_function) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d35073029a3440d8a17e383ce97fcfc582663888..58f7be12ce6b5d447e93cf86c4954a86fccf48ef 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -62,27 +62,19 @@ struct CUBlas { cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const float *beta, void *C, cudaDataType_t Ctype, int ldc) { - // Because the gcc 4.8 doesn't expand template parameter pack that - // appears in a lambda-expression, I can not use template parameter pack - // here. - auto cublas_call = [&]() { +// Because the gcc 4.8 doesn't expand template parameter pack that +// appears in a lambda-expression, I can not use template parameter pack +// here. #if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (platform::TensorCoreAvailable() ? "True" : "False"); + VLOG(5) << "use_tensor_op_math: " + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( - dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc)); + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc)); + }); #else - PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); -#endif - }; - -#if CUDA_VERSION >= 9000 - // NOTES: To use Tensor Core, we should change the cublas config, - // but the cublas may be hold by multi-thread. - dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); -#else - cublas_call(); + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); #endif } }; @@ -170,32 +162,24 @@ struct CUBlas { cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType) { - auto cublas_call = [&]() { #if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = platform::TensorCoreAvailable(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); #else - PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); -#endif - }; - -#if CUDA_VERSION >= 9000 - // NOTES: To use Tensor Core, we should change the cublas config, - // but the cublas may be hold by multi-thread. - dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); -#else - cublas_call(); + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); #endif } }; @@ -223,9 +207,10 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, CUDA_R_32F, N); } else { #endif // CUDA_VERSION >= 8000 - - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, A, lda, &beta, C, N); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N); + }); #if CUDA_VERSION >= 8000 } @@ -266,9 +251,12 @@ inline void Blas::GEMM( CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &h_alpha, h_B, ldb, h_A, lda, - &h_beta, h_C, N); + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -292,8 +280,10 @@ void Blas::GEMM(bool transA, bool transB, int M, } else { #endif // CUDA_VERSION >= 8000 - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, A, lda, &beta, C, ldc); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc); + }); #if CUDA_VERSION >= 8000 } @@ -311,16 +301,19 @@ inline void Blas::GEMM( cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, - ldc); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); } template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CUBlas::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> @@ -330,8 +323,9 @@ void Blas::GEMV(bool trans_a, int M, int N, T beta, T *C) const { cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1, - &beta, C, 1); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -353,28 +347,28 @@ void Blas::BatchedGEMM( #if CUDA_VERSION >= 9010 if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto cublas_call = [&]() { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = platform::TensorCoreAvailable(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, - CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, - CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); - }; - auto &dev_ctx = const_cast(context_); - dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, + strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, + strideC, batchCount, CUDA_R_32F, algo)); + }); } else { #endif // CUDA_VERSION >= 9010 - CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, strideB, A, lda, - strideA, &beta, C, ldc, strideC, batchCount); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); #if CUDA_VERSION >= 9010 } diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 895a7019aa10e5d9bb8f0c17e433a4344eac3bf4..d1127ce4a246136cdd1385ef09d905efe63178d8 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -37,9 +37,6 @@ void Transpose::operator()( for (int i = 0; i < Rank; i++) { permute[i] = axis[i]; } - auto in_dim = in.dims(); - auto out_dim = out->dims(); - auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); auto* dev = context.eigen_device(); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index d55e832cc2d9a4a5e2cb7fe5cf451a1205601951..d6f51c6e5c693becb14ff0bac0088bb9dc2b2f55 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -84,41 +84,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, code_table_.apply_visitor(func); } -template -struct MatrixBitCodeFunctorSelectedRowsAddGrad - : public boost::static_visitor { - const framework::Tensor &tmat_; - framework::SelectedRows *vec_; - - MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) - : tmat_(tmat), vec_(vec) {} - - template - void operator()(const CodeTable &code_table) { - size_t batch_size = tmat_.dims()[0]; - size_t width = tmat_.dims()[1]; - auto *vec_data = vec_->mutable_value()->template data(); - auto *tmat_data = tmat_.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table.get_code(i); - int code_length = code.get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); - int64_t row_index = vec_->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; - } - } - } -}; - -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) { - MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); - code_table_.apply_visitor(func); -} - template struct MatrixBitCodeFunctorSum : public boost::static_visitor { const framework::Tensor &tmat_; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 01e4889d34ad6e409f1b8a9c4bf783800187e863..c399cb5d44aaa50fab00fd170c021c8c70eee990 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -124,11 +124,12 @@ class SimpleCode { template class CustomCode { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) { - seq_len_ = ptable.dims()[1]; - ptable_data_ = ptable.data() + seq_len_ * index; - pcode_data_ = pcode.data() + seq_len_ * index; + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, + int index) { + seq_len_ = path_table.dims()[1]; + path_table_data_ = path_table.data() + seq_len_ * index; + path_code_data_ = path_code.data() + seq_len_ * index; } /** * Here the id of root should be 1 rather than 0, thus the encoding of class c @@ -139,25 +140,25 @@ class CustomCode { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_data_[bit]; } - bool calc_bit(int bit) const { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return path_table_data_[bit]; } + bool calc_bit(int bit) const { return path_code_data_[bit]; } // NOTE: this function is not thread-safe. int get_length() const { if (length_ < 0) { auto len = seq_len_; - length_ = - static_cast(std::find_if(ptable_data_, ptable_data_ + len, - [](const T& val) { return val < 0; }) - - ptable_data_); + length_ = static_cast( + std::find_if(path_table_data_, path_table_data_ + len, + [](const T& val) { return val < 0; }) - + path_table_data_); } return length_; } private: int64_t seq_len_; - const T* ptable_data_; - const T* pcode_data_; + const T* path_table_data_; + const T* path_code_data_; mutable int length_{-1}; }; @@ -181,9 +182,9 @@ class SimpleCodeTable { template class CustomCodeTable { public: - CustomCodeTable(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : ptable_(ptable), pcode_(pcode), ids_(ids) {} + CustomCodeTable(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : ptable_(path_table), pcode_(path_code), ids_(ids) {} CustomCode get_code(int64_t code) const { return CustomCode(ptable_, pcode_, ids_, code); @@ -210,11 +211,11 @@ class MatrixBitCodeFunctor { ids_(ids), code_table_(SimpleCodeTable(num_classes, ids)) {} - MatrixBitCodeFunctor(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : num_classes_(static_cast(ptable.dims()[1])), + MatrixBitCodeFunctor(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -225,11 +226,6 @@ class MatrixBitCodeFunctor { */ void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); - /* For selected rows For j < code_length - vec(0, index(i, j)) += tmat(i, j) - */ - void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec); - /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 6d491dbf1ed162ef07fda4c07e95cc57108486fd..2a47502614b9cd3df4583992669ab4bf78228181 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" @@ -239,15 +240,33 @@ class SequencePoolFunctor { last_pool(context, input, output); return; } - if (pooltype == "FIRST") { math::FirstSeqPoolFunctor first_pool; first_pool(context, input, output); return; } + auto lod = input.lod()[0]; + if (pooltype == "SUM") { + auto place = context.GetPlace(); + PADDLE_ENFORCE(platform::is_cpu_place(place)); + const T* src = input.data(); + T* dst = output->mutable_data(place); + jit::seq_pool_attr_t attr( + static_cast(input.numel() / input.dims()[0]), + jit::SeqPoolType::kSum); + auto seqpool = + jit::Get, platform::CPUPlace>( + attr); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + attr.h = static_cast(lod[i + 1] - lod[i]); + seqpool(src, dst, &attr); + dst += attr.w; + src += attr.h * attr.w; + } + return; + } auto& place = *context.eigen_device(); - auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { Tensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); @@ -258,15 +277,6 @@ class SequencePoolFunctor { auto out_e = EigenVector::Flatten(out_t); if (pooltype == "AVERAGE") { out_e.device(place) = in_e.mean(Eigen::array({{0}})); - } else if (pooltype == "SUM") { - if (h > 0) { - const T* in_data = in_t.data(); - T* out_data = out_t.mutable_data(context.GetPlace()); - blas.VCOPY(w, in_data, out_data); - for (int64_t r = 1; r != h; ++r) { - blas.AXPY(w, 1., in_data + r * w, out_data); - } - } } else if (pooltype == "SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9e99e44822b2fce971b751967ca8076a1f1384ec..1d9d98b10646af9e199f6c481740d30745888707 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -76,7 +76,6 @@ class SoftmaxFunctor> { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); - auto out_dims = Y->dims(); const float* in_data = X->data(); float* out_data = Y->data(); const int kBatchDim = 0; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 35db4c1ad1f6c6481eca397e99fc8c1f0bc7164c..9954e51083b2c4dbc043fe82ee75be91c6d60128 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { "Input(Out@Grad) must not be null."); auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 271428408cb26296ff318bb39414ad0e8ecc0ac8..05afdf53240484212901febee431cef2b35bb75c 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -147,12 +147,6 @@ class MulGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - auto x_mat_dims = framework::flatten_to_2d( - x_dims, ctx->Attrs().Get("x_num_col_dims")); - auto y_mat_dims = framework::flatten_to_2d( - y_dims, ctx->Attrs().Get("y_num_col_dims")); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 06c35c789f8e6ae754f4e5909c466ae3166ce58b..256da34912560ddf1f7e430e8543efe00e5885bc 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("Input"); auto label_dims = ctx->GetInputDim("Label"); - auto w_dims = ctx->GetInputDim("Weight"); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1; if (ctx->HasInput("Bias")) { @@ -154,6 +153,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("is_sparse", "(boolean, default false) Sparse update.") .SetDefault(false); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); + AddAttr>("custom_neg_classes", "This attribute only be used in unitest. Classes " "in this list wiil be used as negative classes " @@ -223,24 +240,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); - auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front(); auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to SelectedRows"; block->Var(weight_grad) ->SetType(framework::proto::VarType::SELECTED_ROWS); - block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to LoDTensor"; block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); - block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); - block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType()); } }; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index f2ca6ec247fd1ea09b707c2eaaad0548c8aa5757..2c97eef096eb3d23273e362e658cb1b5fc808609 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -43,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context, auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); - // int num_total_classes = context.Attr("num_total_classes"); // for unitest std::vector custom_neg_classes = context.Attr>("custom_neg_classes"); @@ -144,15 +149,82 @@ class NCEKernel : public framework::OpKernel { } // forward mul auto input_mat = EigenMatrix::From(*(context.Input("Input"))); - auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - Eigen::Tensor result = - (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * - weight_mat.chip(sample_labels_data[i], 0)) - .sum(); - sample_out_data[i] += result(0); - sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + + std::vector labels; + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + labels.push_back(sample_labels_data[i]); + } + std::set st(labels.begin(), labels.end()); + labels.assign(st.begin(), st.end()); + + framework::Scope &local_scope = context.scope().NewScope(); + + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + auto *ids = local_scope.Var("Ids@Prefetch"); + auto *x_tensor = ids->GetMutable(); + x_tensor->mutable_data( + framework::make_ddim({static_cast(labels.size()), 1}), + context.GetPlace()); + // copy. + std::memcpy(x_tensor->data(), labels.data(), + labels.size() * sizeof(int64_t)); + + std::vector w_dims = paddle::framework::vectorize2int( + context.Input("Weight")->dims()); + w_dims[0] = static_cast(labels.size()); + + auto *w_tensor = local_scope.Var("Weight@Prefetch") + ->GetMutable(); + w_tensor->Resize(framework::make_ddim(w_dims)); + +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch", + table_names, epmap, height_sections, + context, local_scope); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + + auto weight_mat = EigenMatrix::From( + (local_scope.Var("Weight@Prefetch")->Get())); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + std::vector::iterator it = + std::find(labels.begin(), labels.end(), sample_labels_data[i]); + int idx = std::distance(labels.begin(), it); + + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(idx, 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } + context.scope().DeleteScope(&local_scope); + } else { + auto weight_mat = + EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } } + // forward cost for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { out_data[i] = 0; @@ -240,18 +312,19 @@ class NCEGradKernel : public framework::OpKernel { sample_grad_data[i] *= d_out_data[sample_idx]; } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T *d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + bool is_sparse = context.Attr("is_sparse"); if (!is_sparse) { - // get d_bias - auto d_bias = context.Output(framework::GradVarName("Bias")); - if (d_bias != nullptr) { - T *d_bias_data = d_bias->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; - } - } // get d_w auto d_w = context.Output(framework::GradVarName("Weight")); if (d_w != nullptr) { @@ -273,34 +346,6 @@ class NCEGradKernel : public framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto *bias_var = context.InputVar("Bias"); - DDim bias_dim; - if (bias_var->IsType()) { - bias_dim = context.Input("Bias")->dims(); - } else if (bias_var->IsType()) { - auto *table_t = context.Input("Bias"); - bias_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter Bias of a NCE_OP " - "must be either LoDTensor or SelectedRows"); - } - - auto d_bias = - context.Output(framework::GradVarName("Bias")); - d_bias->set_rows(labels); - d_bias->set_height(bias_dim[0]); - - d_bias->mutable_value()->Resize( - {static_cast(labels.size()), bias_dim[1]}); - T *d_bias_data = - d_bias->mutable_value()->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + labels.size(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[d_bias->Index(sample_labels_data[i])] += - sample_grad_data[i]; - } - auto *table_var = context.InputVar("Weight"); DDim table_dim; if (table_var->IsType()) { diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index 8e7457dd56c2413f84008ce467537e07b3e80cc7..2a479081f1e40a4bdc3d80067e4a7d8ebc2bf550 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -23,5 +23,7 @@ limitations under the License. */ #include "ops/binary_unnary_op.h" #include "ops/fill_constant_op.h" +#include "ops/mean_op.h" #include "ops/mul_op.h" +#include "ops/scale_op.h" #include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h index 6610380fcf432d0019f7e844fa9304e151b20efd..0c0d25d0cd1ae536618057ce80388b8eeb81c68a 100644 --- a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h +++ b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -48,4 +47,3 @@ static void BuildUnaryNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8f5092963c8b79501ea68c1f521c4678977635ea --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h @@ -0,0 +1,59 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +template +std::shared_ptr ElementwiseScalar( + float scale, std::shared_ptr node) { + auto node_shape = node->get_shape(); + auto scale_const = ngraph::op::Constant::create(node->get_element_type(), + node_shape, {scale}); + return std::make_shared(scale_const, node); +} + +template +std::shared_ptr ElementwiseScalar( + std::shared_ptr scale_1d, + std::shared_ptr node) { + auto scale_shape = scale_1d->get_shape(); + PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node"); + PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}"); + + auto node_shape = node->get_shape(); + ngraph::AxisSet axis_set; + for (size_t i = 0; i < node_shape.size(); ++i) { + axis_set.insert(i); + } + node_shape.push_back(1); + + auto scale_bcast = + std::make_shared(scale_1d, node_shape, axis_set); + + auto scale_reshape = + paddle::platform::NgReshaper(scale_bcast, node->get_shape()); + + return std::make_shared(scale_reshape, node); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h index 5eff69e7b165fa19c775926914b7b3e8fcb043e5..406a4314f89810df192280cc97de245553d5520f 100644 --- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h +++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -58,4 +57,3 @@ void BuildFillConstantNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4c44bc4c112f401c2707f7babd49a33f238a768f --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/mean_op.h @@ -0,0 +1,66 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildMeanNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map); + ngraph::AxisSet axes; + for (size_t i = 0; i < input->get_shape().size(); ++i) { + axes.insert(i); + } + + auto mean = ngraph::builder::mean(input, axes); + auto mean_1d = std::make_shared( + mean, ngraph::AxisVector{}, ngraph::Shape{1}); + paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map); +} + +void BuildMeanGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto x_shape = x->get_shape(); + float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1, + std::multiplies()); + auto node_const = ngraph::op::Constant::create(og->get_element_type(), + ngraph::Shape{1}, {x_size}); + auto node_div = std::make_shared(og, node_const); + + auto result = ElementwiseScalar( + og / node_const, + ngraph::op::Constant::create(og->get_element_type(), x_shape, {0})); + paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h index 9e12e5d7c3da04706907c7ae63ce8046ce667f25..4a6cbebe245f891c6c33b2116330a41d89d50e25 100644 --- a/paddle/fluid/operators/ngraph/ops/mul_op.h +++ b/paddle/fluid/operators/ngraph/ops/mul_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -131,4 +130,3 @@ static void BuildMulGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..91a57d0be606373e985a30b7ac9c73648062d8e4 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/scale_op.h @@ -0,0 +1,39 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildScaleNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + float scale = op_attrs.Get("scale"); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto out = ElementwiseScalar(scale, x); + paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h index 2b7254497c0e1aab2e653e69e6461f262b929703..ea66953a125860ab1ce8309819b6c433ff32eaaa 100644 --- a/paddle/fluid/operators/ngraph/ops/top_k_op.h +++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -48,4 +47,3 @@ void BuildTopKNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index d0224177ecf7f0c918def08ff4dd6a3c8eb349d8..6c95d3f3bf3a3b0448a8f39915f8b025f7d3bd46 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel { out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); - auto ndim = out_norm->dims(); T eps = static_cast(ctx.Attr("epsilon")); int axis = ctx.Attr("axis"); if (axis < 0) axis = xdim.size() + axis; diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index 6c616aa03d9809e9b7725a700c7edd5ff5d6dc42..3f51bb0b3d6ddf41a08a64f254f76c88b60ced22 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -27,12 +27,14 @@ class AdadeltaOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 9f6ef391696aa8718be71ae945e746b876813d94..13455fc42cdc72a8ebfcac3dc0c94b79497d91f6 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -50,7 +50,8 @@ class AdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto *param_out_tensor = ctx.Output("ParamOut"); auto *moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 1138bb7400e0e7a00983e7bfaad2b2d9704b77ab..61b9384f8422cb531a94096875434ffe36ecdbce 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -347,7 +347,8 @@ class AdamOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; @@ -423,16 +424,23 @@ class AdamOpKernel : public framework::OpKernel { } } + framework::SelectedRows cpu_grad_merge; const framework::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor + framework::SelectedRows* grad_merge_var; scatter::MergeAdd merge_func; - auto* grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + if (platform::is_cpu_place(ctx.GetPlace())) { + grad_merge_var = &cpu_grad_merge; + } else { + // FIXME(qiao): GPU also need to fix this + grad_merge_var = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + } merge_func(ctx.template device_context(), grad, grad_merge_var, true); grad_merge_ptr = grad_merge_var; diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h index 7137fbd9651b4523f6d1609a0595b30758aa40df..55d25ecbddf175c0c9ba2c68ef2f6c7b83dcf32e 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ b/paddle/fluid/operators/optimizers/adamax_op.h @@ -27,12 +27,14 @@ class AdamaxOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 5df43d33ef9f720fd20d57c53ff37cc85440b24e..4abd436927707f1a18039c9104a92b2a0bf3c982 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -27,12 +27,14 @@ class DecayedAdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 8f812c9a037bfac8c1e29e32a5ad5b077c8153d1..bbf34d8316b09a78c334b0d79b132639be8af4f7 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -32,12 +32,14 @@ class FTRLOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index f6ef83c3bad23d709b386f8e75bbc97fa9ba0aab..3ed1bff5ff4993e9c858dea8d56a8cb6124aca89 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -395,7 +395,7 @@ class MomentumOpKernel : public framework::OpKernel { PADDLE_THROW( string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " "gradient, but the received Variable Type is %s", - grad_var->Type().name())); + framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a9d303d55d8f681fe3a014db36ede5ef6b2742bd..975e4b8e7212bc61d5df0ca350fcf12afb463cba 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -60,7 +60,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 0a9a29956affedb8605ab9949070943fbbb54145..f6f40b1daf4b6e5502190aaaab6b976fc960bcda 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -71,7 +72,6 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -130,20 +130,25 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, padding_right_bottom); } - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), input_format); + + mkldnn::memory::data_type dt = + paddle::framework::ToMKLDNNDataType(input->type()); + + auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format); /* create memory descriptor for pooling without specified format * ('any') which lets a primitive (pooling in this case) choose * the memory format preferred for best performance */ - auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::any); - + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any); + auto propagation = src_md.data.data_type == mkldnn_f32 + ? mkldnn::prop_kind::forward_training + : mkldnn::prop_kind::forward_scoring; std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, - padding_right_bottom, ksize, pooling_type, - mkldnn_engine, ceil_mode, is_test); + CreatePrimitiveDesc(src_md, dst_md, propagation, strides, + padding_left_top, padding_right_bottom, ksize, + pooling_type, mkldnn_engine, ceil_mode, is_test); // save pool_pd into global device context to be referred in backward path if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); @@ -203,7 +208,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { private: std::unique_ptr CreatePrimitiveDesc( const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, - const std::vector& stride, const std::vector& padding_left_top, + const mkldnn::prop_kind& propagation, const std::vector& stride, + const std::vector& padding_left_top, const std::vector& padding_right_bot, const std::vector& kernel, const std::string& pooling_type, const mkldnn::engine& engine, bool ceil_mode, bool is_test) const { @@ -411,6 +417,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, - ops::PoolMKLDNNOpKernel); + ops::PoolMKLDNNOpKernel, + ops::PoolMKLDNNOpKernel, + ops::PoolMKLDNNOpKernel); + REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::PoolMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 1a424728f7f6c4034242fb998d5121804e38702b..5666613f6efb99ec484d110857b8067a8f3b2ae5 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; auto in_stride = framework::stride(in_dims); - auto roi_stride = framework::stride(rois->dims()); auto out_stride = framework::stride(out->dims()); const T* input_data = in->data(); diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 7fc07efe7304701794595c9fa63f4a306d61e230..56879ffda5d3e04a88d12d6c4701c24a0d0ee4f7 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -49,7 +49,7 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, - int batch_size, int thread_num, + int batch_size, size_t thread_num, const std::vector& slots, const std::vector& file_list) : batch_size_(batch_size), slots_(slots), file_list_(file_list) { diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index a0b9fa305d85efdb7f2d6972fefd5660d3fba9ff..d0edcc170f0afbccdcdf83eed9a167b7602e34ab 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -49,7 +49,7 @@ class SaveCombineOp : public framework::OperatorBase { } MkDirRecursively(DirName(filename).c_str()); - std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e1c9fd8ff1f08de2f8078309a1a37f79e1c3d401..fcc598f4f16138b4cc13c7b9bb59e79d80cf3596 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -80,7 +80,7 @@ class SaveOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. - std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); @@ -122,7 +122,7 @@ class SaveOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. - std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); framework::SerializeToStream(fout, selectedRows, dev_ctx); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 03b59d71cc0ca2eddd1d9912e7ca25348507ba03..4bded0efb9674f368a3139841f9340c55567da1a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { set_zero(ctx.template device_context(), x_grad, static_cast(0)); - auto out_grad_stride = framework::stride(out_grad->dims()); - for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { Tensor out_grad_t = out_grad->Slice(static_cast(out_lod[0][i]), diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 767449cde981e5925b7144ff1038560c67651f3e..5ede972c71ff3ef8ff00756b97662aabb54d6349 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -63,7 +63,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { } auto *mask_data = cpu_mask->data(); - std::vector> copy_ranges(mask_dim[0]); + std::vector> copy_ranges(2); // set out_true/out_false lod for (size_t t = 0; t < 2; t++) { diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c3d83a06f23a34ec8cf27d585863135ebfd56a4f..6a99ad9a90f69ba3c96fc18dc46dfcadcb6ac631 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& dst_stride, T* dst) { paddle::operators::detail::StridedCopyDimVisitor func( dev_ctx, src, src_stride, dst_stride, dst); - boost::apply_visitor(func, dst_dim); + dst_dim.apply_visitor(func); } // Strided numel memory copy from src to dst by the specified axis diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f9a16ef35ecb9eeb6c8eda9d124ecb17e7f9d5ce..c39f94637a1abb5bfce9a5428419282f2b870c91 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -245,7 +245,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 4f717a43551d6d79292bd1d49664d35588a8ec3a..01996e6bf975227270914aa6bec26aacfc814c94 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -126,7 +126,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. Input type is %s", - x_vars[0]->Type().name()); + framework::ToTypeName(x_vars[0]->Type())); } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 76cc796a9b8e21849b1d86e512cd70752fd027ac..a8b2df186dbfcb2a913e9532e2a475f1ad0d23a1 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -163,7 +163,7 @@ class SumKernel : public framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index d1dff16ddd859e6bf19ec22420c28819a9f14d50..1f51b5bab3068cc89bffa85de28a9438359659f3 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) +cc_library(timer SRCS timer.cc) +cc_test(timer_test SRCS timer_test.cc DEPS timer) + cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) @@ -97,7 +100,7 @@ ENDIF() nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) if(WITH_GPU) - nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) + nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) else() - cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) + cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) endif() diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..122de72e15d587cf33b5d9856ac8b1243f666881 --- /dev/null +++ b/paddle/fluid/platform/cuda_helper.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/macros.h" + +#if CUDA_VERSION < 9000 +enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 }; +#endif + +namespace paddle { +namespace platform { + +class CublasHandleHolder { + public: + CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { + PADDLE_ENFORCE(dynload::cublasCreate(&handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream)); +#if CUDA_VERSION >= 9000 + if (math_type == CUBLAS_TENSOR_OP_MATH) { + PADDLE_ENFORCE( + dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH)); + } +#endif + } + + ~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); } + + template + inline void Call(Callback &&callback) const { + std::lock_guard guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); + + cublasHandle_t handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 022afb686b29c2c493cfd05600ee372470cbc710..09f3d3de54e4388f7090621a0fead96b3043d918 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -245,8 +245,15 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); - PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH)); + + if (TensorCoreAvailable()) { +#if CUDA_VERSION >= 9000 + cublas_tensor_core_handle_.reset( + new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH)); +#endif + } + if (dynload::HasCUDNN()) { cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } @@ -285,7 +292,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) if (dynload::HasCUDNN()) { auto local_cudnn_version = cudnn_dso_ver / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; - if (local_cuda_version < compile_cuda_version) { + if (local_cudnn_version < compile_cudnn_version) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place_.device << ". The installed Paddle is compiled with CUDNN " @@ -306,7 +313,8 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); WaitStreamCallback(); - PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + cublas_handle_.reset(); + cublas_tensor_core_handle_.reset(); eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -335,8 +343,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } -cublasHandle_t CUDADeviceContext::cublas_handle() const { - return cublas_handle_; +bool CUDADeviceContext::tensor_core_available() const { + return cublas_tensor_core_handle_ != nullptr; } cudnnHandle_t CUDADeviceContext::cudnn_handle() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7e875801893f3b73f8efaf33af690f8c855beee4..c81d17380cf894631d06588c007c2e11ce5c7836 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/gpu_info.h" @@ -209,39 +210,6 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; -#if CUDA_VERSION >= 9000 -class ScopedCublasMathMode { - public: - ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) - : handle_(handle) { - need_reset = false; - PADDLE_ENFORCE( - platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), - "Failed to get old cublas math mode"); - if (old_math_mode_ != new_math_mode) { - PADDLE_ENFORCE( - platform::dynload::cublasSetMathMode(handle_, new_math_mode), - "Failed to set old cublas math mode"); - need_reset = true; - } - } - - ~ScopedCublasMathMode() { - if (need_reset) { - PADDLE_ENFORCE( - platform::dynload::cublasSetMathMode(handle_, old_math_mode_), - "Failed to set old cublas math mode"); - } - } - - private: - cublasHandle_t handle_; - cublasMath_t old_math_mode_; - bool need_reset; -}; - -#endif - class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -262,8 +230,25 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; - /*! \brief Return cublas handle in the device context. */ - cublasHandle_t cublas_handle() const; + /*! \brief Call cublas function safely. */ + template + inline void CublasCall(Callback&& callback) const { + cublas_handle_->Call(std::forward(callback)); + } + + /*! \brief Check whether tensor core is supported */ + bool tensor_core_available() const; + + /*! \brief Call cublas function with Tensor Core safely. If + Tensor Core is not available, use DEFAULT_MATH instead. */ + template + inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { + if (cublas_tensor_core_handle_) { + cublas_tensor_core_handle_->Call(std::forward(callback)); + } else { + cublas_handle_->Call(std::forward(callback)); + } + } /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; @@ -282,7 +267,6 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { - std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -294,18 +278,6 @@ class CUDADeviceContext : public DeviceContext { void WaitStreamCallback() const { callback_manager_->Wait(); } -#if CUDA_VERSION >= 9000 - /*! \brief CublasCall may need to change cublas's config, - * but the cublas may be hold by multi-thread, so we should - * add lock here. */ - template - void CublasCall(Callback callback, cublasMath_t new_math) { - std::lock_guard guard(cublas_mtx_); - ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); - callback(); - } -#endif - private: CUDAPlace place_; @@ -313,7 +285,9 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_stream_; std::unique_ptr cudnn_holder_; cudaStream_t stream_; - cublasHandle_t cublas_handle_; + + std::unique_ptr cublas_handle_; + std::unique_ptr cublas_tensor_core_handle_; int compute_capability_; int runtime_version_; @@ -321,12 +295,10 @@ class CUDADeviceContext : public DeviceContext { int multi_process_; int max_threads_per_mp_; - mutable std::mutex mtx_; - // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; - mutable std::mutex cublas_mtx_; + DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; template <> diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 171d2979a0218ad5e22112190a59866b3e0b617f..5b3aa98efb46b51d6c3edb6d2cbd4200bd0a35c6 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -43,9 +43,6 @@ TEST(Device, CUDADeviceContext) { ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); - cublasHandle_t cublas_handle = device_context->cublas_handle(); - ASSERT_NE(nullptr, cublas_handle); - ASSERT_NE(nullptr, device_context->stream()); delete device_context; } } diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index f3cd3b2bbedef7c9140c2acddea0732972ff7fa0..91d9a1ef013449e83f2540a6646c96e34347ccc1 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R6 +CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#endif + #ifdef CUDNN_DNN_ROUTINE_EACH_R7 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 990e44cd211c001c436dce8ff74a89a5516b38ae..15d516836652ea4ea4d1bcdf35022e6b79cc3b52 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,6 +53,12 @@ namespace platform { namespace dynload { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; +#endif + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -173,6 +181,8 @@ void* GetCublasDsoHandle() { void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); #endif @@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07f848356e801be95d53a61bb5b08e37..06680539507886becb751d02cb02abbf702f1948 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,21 +258,49 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ + ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); + +#define __PADDLE_THROW_ON_ERROR(COND, ...) \ + __PADDLE_THROW_ERROR_I( \ + __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define __PADDLE_ENFORCE_I(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (false) + } while (0) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG +#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args +#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) + #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d52182965552e9ec945cb7d0b421d8addcb758e9..1091badae54a809c4a9da6d0398bcbb538420af0 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -37,6 +37,25 @@ TEST(ENFORCE, FAILED) { HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); } EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok at all"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok at all")); + } + EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_NE(std::string(error.what()).find(" at "), 0); + } + EXPECT_TRUE(caught_exception); } TEST(ENFORCE, NO_ARG_OK) { diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index 27e930e6e0a76982b3f27619f38a4a08d82cafa1..3a937dfaec3acc7c116f0077694e9aee1b379061 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index e2b7ca9b03809113c31af8ff4d3ad3713748f330..b1b51d804e02f233bcd16149005092dc80e9c79d 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 584df85e80203c383a89954aac73dd1dcd723f7c..b3d20736a8e70d2f57ee5d6dc97cb490b5cfee44 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -145,7 +145,8 @@ class MKLDNNHandler { const std::shared_ptr user_memory_p, const std::string& suffix, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -159,8 +160,20 @@ class MKLDNNHandler { std::shared_ptr reorder_p; if (mpd != user_mpd) { target_memory_p = std::make_shared(mpd); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + std::shared_ptr reorder_p; + if (is_INT8) { + mkldnn::primitive_attr + attri; // attribute for int8 weights and bias data reorder. + attri.set_output_scales(mask, scale_data); + + auto reorder_pd = std::shared_ptr( + new mkldnn::reorder::primitive_desc(user_mpd, mpd, attri)); + reorder_p = std::shared_ptr(new mkldnn::reorder( + *reorder_pd, *user_memory_p, *target_memory_p)); + } else { + reorder_p = std::make_shared(*user_memory_p, + *target_memory_p); + } dev_ctx_.SetBlob(key_reorder_p, reorder_p); pipeline.push_back(*reorder_p); } @@ -182,22 +195,58 @@ class MKLDNNHandler { return dims2str(operand_dims) + suffix; } - template + template static void SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, std::vector dst_tz, const mkldnn::engine& engine, std::shared_ptr& dst_pd, // NOLINT std::shared_ptr& dst_memory) { // NOLINT - M* output_data = output->mutable_data(ctx.GetPlace()); + T* output_data = output->mutable_data(ctx.GetPlace()); auto dst_md = platform::MKLDNNMemDesc( {dst_tz}, paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType), + framework::DataTypeTrait::DataType), mkldnn::memory::format::nhwc); dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine)); - dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + } + + static void AppendKey( + std::string* key, const mkldnn::memory::dims& input_dims, + const mkldnn::memory::dims& weights_dims, const std::vector& strides, + const std::vector& paddings, const std::vector& dilations, + const int& groups, const mkldnn::memory::data_type& srcdt, + const mkldnn::memory::format& format, + const mkldnn::memory::data_type& dstdt, const std::string& suffix) { + AppendKeyDims(key, input_dims); + AppendKeyDims(key, weights_dims); + AppendKeyVec(key, strides); + AppendKeyVec(key, paddings); + AppendKeyVec(key, dilations); + AppendKey(key, std::to_string(groups)); + AppendKey(key, std::to_string(srcdt)); + AppendKey(key, std::to_string(format)); + AppendKey(key, std::to_string(dstdt)); + AppendKey(key, suffix); } protected: + static void AppendKeyDims(std::string* key, + const mkldnn::memory::dims& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKeyVec(std::string* key, const std::vector& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKey(std::string* key, const std::string& s) { + key->append(s); + } + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { @@ -215,7 +264,8 @@ class MKLDNNHandler { class TransposeMKLDNNHandler : public MKLDNNHandler { public: - TransposeMKLDNNHandler(std::vector& dims, std::vector& axis, + TransposeMKLDNNHandler(std::vector& dims, // NOLINT + std::vector& axis, // NOLINT const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), @@ -303,8 +353,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { } protected: - mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, - std::vector& axis) { + mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT + std::vector& axis // NOLINT + ) { mkldnn_memory_desc_t mem_fmt; mem_fmt.primitive_kind = mkldnn_memory; @@ -462,21 +513,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); - return this->AcquireMemory(weights_pd, user_weights_pd, - user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent); + return this->AcquireMemory( + weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", + pipeline, is_persistent, is_INT8, scale_data, mask); } std::shared_ptr AcquireBiasMemoryFromPrimitive( const std::shared_ptr user_bias_memory_p, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, + int mask = 0) { // NOLINT auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); auto bias_pd = conv_pd_->bias_primitive_desc(); return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline); + "@bias_mem_p", pipeline, is_persistent, is_INT8, + scale_data, mask); } std::shared_ptr AcquireConvolution( @@ -594,5 +650,29 @@ using ConvTransposeMKLDNNHandler = ConvMKLDNNTemplateHandler; + +template +static std::shared_ptr SetDstMemory( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + return dst_memory_p; +} + +template +static std::shared_ptr SetDstMemoryHandler( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p; + dst_memory_p->set_data_handle(to_void_cast(output_data)); + return dst_memory_p; +} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 6ce4bf8f13922e2756c3ee8f189bd36123d6964c..8df8e32098697540f02d488c873f5ae7fb29828e 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -106,7 +106,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. - if (num_trainers == 1) { + if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 998242fb4a09138db24aa75759f4990ffdc4d4e2..85977366e61c676fc5d2d3c5d22dd2f606543684 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/port.h" - #include #include #include @@ -25,9 +22,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA + #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); @@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; + std::lock_guard l(profiler_mu); + is_enabled_ = true; dev_ctx_ = dev_ctx; name_ = name; @@ -184,8 +185,8 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; + std::lock_guard l(profiler_mu); DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index e4e5be5b89f4cbecd6b5e9deec9cc5bffa6a4917..35d1d929819c41b213bc51ec24ac725021a76c88 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -14,12 +14,27 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include +#include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" + DECLARE_double(limit_of_temporary_allocation); namespace paddle { namespace platform { +class DummyOp : public framework::OperatorBase { + public: + DummyOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override {} +}; + TEST(temporary_allocator, temporary_allocator) { platform::CPUPlace cpu_place; TemporaryAllocator alloc(cpu_place); @@ -68,96 +83,92 @@ TEST(temporary_allocator, add_callback) { } TEST(temporary_allocator, create_tensor_with_allocationptr) { - platform::CPUPlace cpu_place; - TemporaryAllocator cpu_alloc(cpu_place); + framework::VariableNameMap dummy_vars; + framework::AttributeMap dummy_attrs; + DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); + framework::Scope scope; + framework::VariableValueMap vars; + framework::RuntimeContext run_ctx(vars, vars); + size_t memory_size = 300; { - size_t memory_size = 200; - auto allocation = cpu_alloc.Allocate(memory_size); - void* address = allocation->ptr(); + platform::CPUPlace cpu_place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(cpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + int numel = memory_size / sizeof(float); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - { - size_t memory_size = 300; - auto allocation = gpu_alloc.Allocate(memory_size); - void* address = allocation->ptr(); + platform::CUDAPlace gpu_place(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); int numel = memory_size / sizeof(float); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } - - // The allocation is not holded now, it should be placed to - // TemporaryAllocationQueue. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); #endif } TEST(temporary_allocator, create_tensor_with_allocationptr2) { - platform::CPUPlace cpu_place; - TemporaryAllocator cpu_alloc(cpu_place); + framework::VariableNameMap dummy_vars; + framework::AttributeMap dummy_attrs; + DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); + framework::Scope scope; + framework::VariableValueMap vars; + framework::RuntimeContext run_ctx(vars, vars); + size_t memory_size = 400; { - size_t memory_size = 400; + platform::CPUPlace cpu_place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(cpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; - void* address; { - auto allocation = cpu_alloc.Allocate(memory_size); - address = allocation->ptr(); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); out_side_tensor.ShareDataWith(tensor); } - PADDLE_ENFORCE_EQ(address, out_side_tensor.data()); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); } #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); { - void* address; + platform::CUDAPlace gpu_place(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + size_t memory_size = 500; int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; { - auto allocation = gpu_alloc.Allocate(memory_size); - address = allocation->ptr(); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); out_side_tensor.ShareDataWith(tensor); } - PADDLE_ENFORCE_EQ(address, out_side_tensor.data()); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); - // The allocation is holded by out_side_tensor. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); } - - // The allocation is not holded now, it should be placed to - // TemporaryAllocationQueue. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); #endif } diff --git a/paddle/fluid/platform/timer.cc b/paddle/fluid/platform/timer.cc new file mode 100644 index 0000000000000000000000000000000000000000..75d4e5cbf90bd81c73756605eacc6b0c15a63e9d --- /dev/null +++ b/paddle/fluid/platform/timer.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace platform { + +void Timer::Reset() { + _start.tv_sec = 0; + _start.tv_usec = 0; + + _count = 0; + _elapsed = 0; + _paused = true; +} + +void Timer::Start() { + Reset(); + Resume(); +} + +void Timer::Pause() { + if (_paused) { + return; + } + _elapsed += Tickus(); + ++_count; + _paused = true; +} + +void Timer::Resume() { + gettimeofday(&_start, NULL); + _paused = false; +} + +int Timer::Count() { return _count; } + +double Timer::ElapsedUS() { return static_cast(_elapsed); } + +double Timer::ElapsedMS() { return _elapsed / 1000.0; } + +double Timer::ElapsedSec() { return _elapsed / 1000000.0; } + +int64_t Timer::Tickus() { + gettimeofday(&_now, NULL); + return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L + + (_now.tv_usec - _start.tv_usec); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h new file mode 100644 index 0000000000000000000000000000000000000000..56019ae7cf21c15c10b1f9247c9d95deb2a48c43 --- /dev/null +++ b/paddle/fluid/platform/timer.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/port.h" + +#ifdef _WIN32 +static unsigned sleep(unsigned seconds) { + Sleep(seconds * 1000); + return 0; +} +#endif + +namespace paddle { +namespace platform { + +// A Standard Timer implementation for debugging +class Timer { + public: + // a timer class for profiling + // Reset() will be called during initialization + // all timing variables will be set 0 in Reset() + Timer() { Reset(); } + void Reset(); + void Start(); + void Pause(); + // Resume will get current system time + void Resume(); + int Count(); + // return elapsed time in us + double ElapsedUS(); + // return elapsed time in ms + double ElapsedMS(); + // return elapsed time in sec + double ElapsedSec(); + + private: + struct timeval _start; + struct timeval _now; + int _count; + int _elapsed; + bool _paused; + + // get us difference between start and now + int64_t Tickus(); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/timer_test.cc b/paddle/fluid/platform/timer_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..09edf8131ffa5c1dfe607b7d72627b225c4452fa --- /dev/null +++ b/paddle/fluid/platform/timer_test.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/platform/timer.h" +#include "gtest/gtest.h" + +TEST(Timer, Reset) { + paddle::platform::Timer timeline; + timeline.Start(); + sleep(3); + timeline.Pause(); + timeline.Reset(); +} + +TEST(Timer, Start) { + paddle::platform::Timer timeline; + timeline.Start(); + sleep(3); + timeline.Pause(); +} + +TEST(Timer, Pause) { + paddle::platform::Timer timeline; + timeline.Start(); + sleep(3); + timeline.Pause(); +} + +TEST(Timer, Resume) { + paddle::platform::Timer timeline; + timeline.Start(); + sleep(3); + timeline.Pause(); + timeline.Resume(); +} diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb8bcb190bda59e23d118547f451be46c963cce9..72b0f216d3aafbb95931590935b0bf967a8d5be8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index f8ded9f94ecaf3df1e14aead60ae12abcf8c34a9..06d8b65fb1480d9f621ca937c1d66ab7e910f010 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,9 +49,6 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); - op_proto_and_checker_maker.def( - "kOpCreationCallstackAttrName", - framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be63fb877869b64035207342e5d4398e481dbc99..5c1c7478f4dbe4c78f5ac2c19f4eae09abbf1c8b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/tracer.h" namespace paddle { @@ -24,13 +23,10 @@ namespace pybind { void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block, - framework::BlockDesc *startup_block) { - new (&self) imperative::Tracer(root_block, startup_block); + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace) - .def("get_scope", &imperative::Tracer::GetScope, - pybind11::return_value_policy::reference); + .def("trace", &imperative::Tracer::Trace); } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 88a2a5276ab52e25b4f790e3a2f1386ed0715b4e..dce755c91a58d3291d740bd05c1cf835cbfbf1f0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -32,6 +32,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope_pool.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" @@ -83,11 +84,15 @@ bool IsCompiledWithCUDA() { } bool IsCompiledWithBrpc() { -#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA) - return true; -#else +#ifndef PADDLE_WITH_DISTRIBUTE + return false; +#endif + +#ifdef PADDLE_WITH_GRPC return false; #endif + + return true; } bool IsCompiledWithDIST() { @@ -117,20 +122,42 @@ PYBIND11_MODULE(core, m) { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); - py::class_(m, "VarBase", R"DOC()DOC") - .def(py::init<>()) + m.add_object("_cleanup", + py::capsule([]() { ScopePool::Instance().Clear(); })); + + py::class_>( + m, "VarBase", R"DOC()DOC") + // .def(py::init<>()) + .def(py::init(), py::arg("stop_gradient") = false) .def("_run_backward", - [](imperative::VarBase &self, framework::Scope *scope) { - self.RunBackward(scope); - }) + [](imperative::VarBase &self) { self.RunBackward(); }) + .def("_grad_name", &imperative::VarBase::GradName) .def("_grad", &imperative::VarBase::Grad) + .def_property("grad_value", + [](const imperative::VarBase &self) { return self.grads_; }, + [](imperative::VarBase &self, framework::Variable *grad) { + self.grads_ = grad; + }, + py::return_value_policy::reference) + .def_property("value", + [](const imperative::VarBase &self) { return self.var_; }, + [](imperative::VarBase &self, framework::Variable *var) { + self.var_ = var; + }, + py::return_value_policy::reference) .def_property( "desc", [](const imperative::VarBase &self) { return self.var_desc_; }, [](imperative::VarBase &self, framework::VarDesc *var_desc) { self.var_desc_ = var_desc; }, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def_property( + "stop_gradient", + [](const imperative::VarBase &self) { return self.stop_gradient_; }, + [](imperative::VarBase &self, bool stop_gradient) { + self.stop_gradient_ = stop_gradient; + }); py::class_(m, "OpBase", R"DOC()DOC") .def(py::init<>()) @@ -454,7 +481,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::copy); - py::class_(m, "Scope", R"DOC( + py::class_(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. Variables in a parent scope can be retrieved from local scope. @@ -474,17 +501,26 @@ All parameter, weight, gradient are variables in Paddle. param.set(param_array, place) )DOC") + .def("_remove_from_pool", + [](Scope &self) { ScopePool::Instance().Remove(&self); }) .def("var", [](Scope &self, const std::string &name) -> Variable * { return self.Var(name); }, py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::return_value_policy::reference) - .def(py::init<>()) .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, py::return_value_policy::reference) .def("drop_kids", &Scope::DropKids); + m.def("Scope", + []() -> Scope * { + auto *s = new Scope(); + ScopePool::Instance().Insert(std::unique_ptr(s)); + return s; + }, + py::return_value_policy::reference); + //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. m.def("get_all_op_protos", []() -> std::vector { @@ -910,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is STR, debug_graphviz_path indicate the path that writing the SSA Graph to file in the form of graphviz, you. It is useful for debugging. Default "")DOC") - .def_property( - "enable_data_balance", - [](const BuildStrategy &self) { return self.enable_data_balance_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); - self.enable_data_balance_ = b; - }) // FIXME(chengudo): enable_data_balance seems not important .def_property( "enable_sequential_execution", [](const BuildStrategy &self) { @@ -971,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle. "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "is_distribution", + [](const BuildStrategy &self) { return self.is_distribution_; }, + [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) .def_property( "memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index a2eec6e3c48dd126614bbff0227145537b678ac4..0b94b60018aac3a61edfda4d7ecb762e9fe70673 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -87,7 +87,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { template std::string Sprintf(const Args&... args) { std::ostringstream oss; - Fprintf(oss, ""); + Fprintf(oss, "%s", args...); return oss.str(); } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 418dc1346840f707f38836e09a14ca2c19c063ff..50b7a631297b150ac9d25c036d21b0bdf2854b79 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - #================================================= # Utils #================================================= @@ -200,6 +199,7 @@ function cmake_gen() { -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DPY_VERSION=${PY_VERSION:-2.7} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} + -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -233,7 +233,8 @@ EOF -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DPY_VERSION=${PY_VERSION:-2.7} \ - -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \ + -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} } @@ -418,13 +419,6 @@ EOF else ctest --output-on-failure fi - - # make install should also be test when unittest - make install -j `nproc` - pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then - paddle version - fi fi } @@ -455,7 +449,7 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi @@ -535,6 +529,18 @@ function assert_api_spec_approvals() { fi fi + pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl + CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py` + if [ "True" != ${CHECK_DOCK_MD5} ]; then + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "You must have shanyi15 approval for the api doc change! " + exit 1 + fi + echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt + fi } @@ -922,6 +928,7 @@ function main() { ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} + assert_api_spec_approvals ;; test_inference) gen_capi_package @@ -946,6 +953,15 @@ function main() { run_test assert_api_not_changed ${PYTHON_ABI:-""} ;; + cmake_gen) + cmake_gen ${PYTHON_ABI:-""} + ;; + gen_fluid_lib) + gen_fluid_lib + ;; + test_fluid_lib) + test_fluid_lib + ;; *) print_usage exit 0 diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index ef43d13e18698748717dff35c85b243edec44592..47c5248b57d1946f4b4db30d6cdf60dfd3aee0bd 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,20 +28,53 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } + + std::vector envs; + std::vector undefok; +#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC) + envs.push_back("max_body_size"); +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - new_argv.push_back( - strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); + envs.push_back("fraction_of_gpu_memory_to_use"); + envs.push_back("allocator_strategy"); #elif __clang__ - new_argv.push_back( - strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_" - "mb,allocator_strategy")); - new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); + envs.push_back("use_mkldnn"); + envs.push_back("initial_cpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + + undefok.push_back("use_mkldnn"); + undefok.push_back("initial_cpu_memory_in_mb"); #else - new_argv.push_back( - strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_" - "mb,allocator_strategy")); - new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); + envs.push_back("use_pinned_memory"); + envs.push_back("use_mkldnn"); + envs.push_back("initial_cpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + + undefok.push_back("use_mkldnn"); + undefok.push_back("initial_cpu_memory_in_mb"); #endif + + if (envs.size() > 0) { + std::string env_string = "--tryfromenv="; + for (auto t : envs) { + env_string += t + ","; + } + env_string = env_string.substr(0, env_string.length() - 1); + new_argv.push_back(strdup(env_string.c_str())); + VLOG(1) << "gtest env_string:" << env_string; + } + + if (undefok.size() > 0) { + std::string undefok_string = "--undefok="; + for (auto t : undefok) { + undefok_string += t + ","; + } + undefok_string = undefok_string.substr(0, undefok_string.length() - 1); + new_argv.push_back(strdup(undefok_string.c_str())); + VLOG(1) << "gtest undefok_string:" << undefok_string; + } + int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8f3660ca387ba10309f829bd04ac2ffdc573f3d6..2c17716500ababfab3216a5ec47fecca30065ff1 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -46,7 +46,7 @@ from . import transpiler from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope +from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor @@ -102,13 +102,6 @@ def __bootstrap__(): import sys import os import platform - - if os.name == 'nt': - third_lib_path = os.path.abspath(os.path.dirname( - __file__)) + os.sep + '..' + os.sep + 'libs' - os.environ['path'] += ';' + third_lib_path - sys.path.append(third_lib_path) - from . import core in_test = 'unittest' in sys.modules @@ -135,7 +128,8 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', + 'enable_parallel_graph' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') @@ -151,12 +145,17 @@ def __bootstrap__(): read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_disable_reuse_port') + if core.is_compiled_with_brpc(): + read_env_flags.append('max_body_size') + #set brpc max body size + os.environ['FLAGS_max_body_size'] = "2147483647" if core.is_compiled_with_cuda(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus' + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', + 'sync_nccl_allreduce' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index af02721eb72c1d0f8aa3d7ab8db504c4c33b64d5..c280ff21eec8d1a90b8be9102d7eae119f38f2b1 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -272,8 +272,7 @@ class DataFeeder(object): dict: the result of conversion. Raises: - ValueError: If drop_last is False and the data batch which cannot - fit for devices. + ValueError: If drop_last is False and the data batch which cannot fit for devices. """ def __reader_creator__(): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f2886090d75f87654b33cf7aa6f98ebf6f2e27d1..5a9e908b61eeeea3fdfdfcc54d1f150f59a3973b 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -191,7 +191,7 @@ def _fetch_var(name, scope=None, return_numpy=True): assert isinstance(name, str) if scope is None: scope = global_scope() - assert isinstance(scope, core.Scope) + assert isinstance(scope, core._Scope) var = scope.find_var(name) assert var is not None, ( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3427fb0c4ae4dad0323b39cf85bfb5b04f796996..70767c962f551bdf3afea2237000a4cf93feb120 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -15,18 +15,24 @@ from __future__ import print_function import collections +from collections import defaultdict import contextlib import os import re import six -import sys -import traceback import numpy as np from .. import compat as cpt from .proto import framework_pb2 try: + if os.name == 'nt': + import sys + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . import core except ImportError as e: if os.name == 'nt': @@ -368,19 +374,26 @@ class Variable(object): if _in_imperative_mode(): self._ivar = core.VarBase() self._ivar.desc = self.desc + self._ivar.stop_gradient = stop_gradient def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) - tensor = core.get_variable_tensor(scope, self.desc.name()) + tensor = self._ivar.value.get_tensor() return np.array(tensor) def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) - self._ivar._run_backward(scope) + self._ivar._run_backward() def _gradient(self): return np.array(self._ivar._grad()) + @property + def _value(self): + return self._ivar.value + + @_value.setter + def _value(self, v): + self._ivar.value = v + def __str__(self): return self.to_string(True) @@ -424,6 +437,14 @@ class Variable(object): """ self.desc = input + @property + def _stop_gradient(self): + return self._ivar.stop_gradient + + @_stop_gradient.setter + def _stop_gradient(self, s): + self._ivar.stop_gradient = s + @property def persistable(self): return self.desc.persistable() @@ -605,10 +626,6 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] - callstack_var_name = op_maker.kOpCreationCallstackAttrName() - op_attrs[callstack_var_name] = list( - reversed(traceback.format_stack()))[1:] - if len(self.desc.type()) != 0: return if type is None: @@ -653,20 +670,16 @@ class Operator(object): self.desc.set_input(in_proto.name, []) if outputs is not None: - given = set() - need = set() - for n in outputs: - given.add(n) for m in proto.outputs: - need.add(m.name) - if not given == need: - raise ValueError(("Incorrect setting for output(s) of " - "operator \"%s\". Need: [%s] Given: [%s]") % - (type, - ", ".join(six.binary_type(e) for e in need), - ", ".join(six.binary_type(e) for e in given))) - + if (m.name not in outputs) and m.dispensable: + continue + if not ((m.name in outputs) or m.dispensable): + raise ValueError( + ("Incorrect setting for output(s) of " + "operator \"%s\", should set: [%s].") % (type, m.name)) for out_proto in proto.outputs: + if out_proto.name not in outputs: + continue out_args = outputs[out_proto.name] if not isinstance(out_args, list): out_args = [out_args] @@ -691,26 +704,28 @@ class Operator(object): self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() + if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + if _in_imperative_mode(): self.iop = core.OpBase() self.iop.desc = self.desc - self.inputs = [] + self.inputs = defaultdict(list) if inputs is not None: - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - self.outputs = [] + for k, v in six.iteritems(inputs): + if isinstance(v, Variable): + self.inputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.inputs[k].extend([var._ivar for var in v]) + self.outputs = defaultdict(list) if outputs is not None: - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) + for k, v in six.iteritems(outputs): + if isinstance(v, Variable): + self.outputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.outputs[k].extend([var._ivar for var in v]) def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET @@ -1276,13 +1291,22 @@ class Block(object): Operator: the append Operator. """ op_desc = self.desc.append_op() - op = Operator(block=self, desc=op_desc, *args, **kwargs) - if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + op = Operator( + block=self, + desc=op_desc, + type=kwargs.get("type", None), + inputs=kwargs.get("inputs", None), + outputs=kwargs.get("outputs", None), + attrs=kwargs.get("attrs", None)) self.ops.append(op) + self._trace_op(op, kwargs.get("stop_gradient", False)) return op + def _trace_op(self, op, stop_gradient=False): + if _in_imperative_mode(): + _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, + stop_gradient) + def _insert_op(self, index, *args, **kwargs): """ Insert a Operator according to the giving arguments. @@ -1328,11 +1352,15 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() - op = Operator(self, op_desc, *args, **kwargs) - if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + op = Operator( + self, + op_desc, + type=kwargs.get("type", None), + inputs=kwargs.get("inputs", None), + outputs=kwargs.get("outputs", None), + attrs=kwargs.get("attrs", None)) self.ops.insert(0, op) + self._trace_op(op, kwargs.get("stop_gradient", False)) return op def _sync_with_cpp(self): @@ -1646,8 +1674,8 @@ class Program(object): parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need to print. - Returns - (str): The debug string. + Returns: + str : The debug string. Raises: ValueError: If any of required fields is not set and throw_on_error is diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py index 922308b6b18b335535d41f24d544cde04991b794..54dc794ea6392fac6f266477fe045b37001a8666 100644 --- a/python/paddle/fluid/imperative/__init__.py +++ b/python/paddle/fluid/imperative/__init__.py @@ -20,6 +20,10 @@ from .base import * from . import layers from .layers import * +from . import nn +from .nn import * + __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ +__all__ += nn.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index aa48ef71aa61086764019ac29abd9cb4c53325fa..c04dcc7e39be9946b561fd725647e87d7712d8b9 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -28,8 +28,7 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() - tracer = core.Tracer(train.current_block().desc, - startup.current_block().desc) + tracer = core.Tracer(train.current_block().desc) with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): @@ -46,8 +45,7 @@ def to_variable(value, block=None): name=None, shape=value.shape, dtype=value.dtype) - scope = framework._imperative_tracer().get_scope(block.desc) - var = scope.var(py_var.name) + var = py_var._ivar.value tensor = var.get_tensor() tensor.set(value, core.CPUPlace()) return py_var diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 044717c31975d671818cae17cd989774c96ed9fa..d78d61eb3f02c27ec44806ae52e134068c2cb9be 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -24,26 +24,21 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): - def __init__(self): - self._built = False - - def __call__(self, inputs): - if not isinstance(inputs, list) and not isinstance(inputs, tuple): - inputs = [inputs] - - var_inputs = [] - for x in inputs: - py_var = base.to_variable(x) - var_inputs.append(py_var) - if not self._built: - self._build_once(inputs) - self._built = True - - outputs = self.forward(var_inputs) - return outputs + def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None): + self._once_built = False + self._dtype = dtype def _build_once(self, inputs): pass - def forward(self, inputs): - return [] + def __call__(self, *inputs): + if not self._once_built: + self._build_once(*inputs) + self._once_built = True + + outputs = self.forward(*inputs) + + return outputs + + def forward(self, *inputs): + raise NotImplementedError diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py new file mode 100644 index 0000000000000000000000000000000000000000..4f30417e99d21bcb66dacaab0257816c4d77f932 --- /dev/null +++ b/python/paddle/fluid/imperative/nn.py @@ -0,0 +1,250 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from six.moves import reduce + +from .. import core +from ..layers import utils +from . import layers +from ..framework import Variable, OpProtoHolder +from ..param_attr import ParamAttr +from ..initializer import Normal, Constant + +__all__ = [ + 'Conv2D', + 'Pool2D', + 'FC', +] + + +class Conv2D(layers.PyLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + use_cudnn=True, + act=None, + param_attr=None, + bias_attr=None, + name=None, + dtype=core.VarDesc.VarType.FP32): + assert param_attr is not False, "param_attr should not be False here." + super(Conv2D, self).__init__(name=name, dtype=dtype) + + from ..layer_helper import LayerHelper + self._helper = LayerHelper( + type(self).__name__, + param_attr=param_attr, + bias_attr=bias_attr, + dtype=dtype, + name=name) + + self._groups = groups + self._stride = utils.convert_to_list(stride, 2, 'stride') + self._padding = utils.convert_to_list(padding, 2, 'padding') + self._dilation = utils.convert_to_list(dilation, 2, 'dilation') + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + self._use_cudnn = use_cudnn + self._num_channels = num_channels + if (self._num_channels == self._groups and + num_filters % self._num_channels == 0 and not self._use_cudnn): + self._l_type = 'depthwise_conv2d' + else: + self._l_type = 'conv2d' + + if groups is None: + num_filter_channels = num_channels + else: + if num_channels % groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels // groups + filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + filter_shape = [num_filters, int(num_filter_channels)] + filter_size + + def _get_default_param_initializer(): + filter_elem_num = filter_size[0] * filter_size[1] * num_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std, 0) + + self._filter_param = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=filter_shape, + dtype=self._dtype, + default_initializer=_get_default_param_initializer()) + + if self._use_cudnn: + self._helper.create_variable( + name="kCUDNNFwdAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + self._helper.create_variable( + name="kCUDNNBwdDataAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + self._helper.create_variable( + name="kCUDNNBwdFilterAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + + self._bias_param = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=[num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type=self._l_type, + inputs={ + 'Input': input, + 'Filter': self._filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False, + }) + + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + + return self._helper.append_activation(pre_act) + + +class Pool2D(layers.PyLayer): + def __init__(self, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + exclusive=True, + name=None, + dtype=core.VarDesc.VarType.FP32): + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if global_pooling is False and pool_size == -1: + raise ValueError( + "When the global_pooling is False, pool_size must be passed " + "and be a valid value. Received pool_size: " + str(pool_size)) + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + super(Pool2D, self).__init__(name=name, dtype=dtype) + + from ..layer_helper import LayerHelper + self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name) + + self._pool_type = pool_type + self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') + self._pool_padding = utils.convert_to_list(pool_padding, 2, + 'pool_padding') + self._pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') + self._global_pooling = global_pooling + self._use_cudnn = use_cudnn + self._ceil_mode = ceil_mode + self._exclusive = exclusive + self._l_type = 'pool2d' + + def forward(self, input): + pool_out = self._helper.create_variable_for_type_inference(self._dtype) + + self._helper.append_op( + type=self._l_type, + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": self._pool_type, + "ksize": self._pool_size, + "global_pooling": self._global_pooling, + "strides": self._pool_stride, + "paddings": self._pool_padding, + "use_cudnn": self._use_cudnn, + "ceil_mode": self._ceil_mode, + "use_mkldnn": False, + "exclusive": self._exclusive, + }) + return pool_out + + +class FC(layers.PyLayer): + def __init__(self, + size, + param_attr=None, + num_flatten_dims=1, + dtype=core.VarDesc.VarType.FP32): + super(FC, self).__init__() + self._size = size + self._num_flatten_dims = num_flatten_dims + self._dtype = dtype + from ..layer_helper import LayerHelper + self._helper = LayerHelper('FC', param_attr=param_attr) + + def _build_once(self, input): + input_shape = input.shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) + ] + [self._size] + self._w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, input): + tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": input, + "Y": self._w}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="sum", + inputs={"X": [tmp]}, + outputs={"Out": out}, + attrs={"use_mkldnn": False}) + return out diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 26d1f8f4d2bd67a35c4ec96a025ee273cec4dbd1..8a2cd4a9290ab1d2f3e2af2d682994c999d7b931 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -162,7 +162,8 @@ class ConstantInitializer(Initializer): "dtype": int(var.dtype), "value": float(self._value), 'force_cpu': self._force_cpu or force_init_on_cpu() - }) + }, + stop_gradient=True) var.op = op return op @@ -231,7 +232,8 @@ class UniformInitializer(Initializer): "min": self._low, "max": self._high, "seed": self._seed - }) + }, + stop_gradient=True) if var.dtype == VarDesc.VarType.FP16: block.append_op( @@ -309,7 +311,8 @@ class NormalInitializer(Initializer): "std": self._std_dev, "seed": self._seed, "use_mkldnn": False - }) + }, + stop_gradient=True) if var.dtype == VarDesc.VarType.FP16: block.append_op( @@ -371,7 +374,8 @@ class TruncatedNormalInitializer(Initializer): "mean": self._mean, "std": self._std_dev, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op @@ -461,7 +465,8 @@ class XavierInitializer(Initializer): "min": -limit, "max": limit, "seed": self._seed - }) + }, + stop_gradient=True) else: std = np.sqrt(2.0 / float(fan_in + fan_out)) @@ -474,7 +479,8 @@ class XavierInitializer(Initializer): "mean": 0.0, "std": std, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op @@ -559,7 +565,8 @@ class MSRAInitializer(Initializer): "min": -limit, "max": limit, "seed": self._seed - }) + }, + stop_gradient=True) else: std = np.sqrt(2.0 / float(fan_in)) @@ -572,7 +579,8 @@ class MSRAInitializer(Initializer): "mean": 0.0, "std": std, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 74b4a977db6b69d4d256e1f7b36eb53524269bb1..ea9953f5814207c569e799c2ffa11758e31699aa 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -20,10 +20,10 @@ import six import sys import numpy as np -from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating +from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode from . import unique_name +from paddle.fluid.imperative import base as imperative_base from paddle.fluid.initializer import Constant, Xavier -from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip @@ -50,7 +50,7 @@ class LayerHelper(object): return default_startup_program() def to_variable(self, x): - return base.to_variable(x, self.main_program.current_block()) + return imperative_base.to_variable(x, self.main_program.current_block()) def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) @@ -313,11 +313,20 @@ class LayerHelper(object): param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - - self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) - return self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs()) + if _in_imperative_mode(): + # In imperative mode, we want the returned parameter to be + # initialized so that it can be used imperatively. + return self.main_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + else: + self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + return self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) def get_parameter(self, name): param = self.main_program.global_block().var(name) @@ -369,13 +378,16 @@ class LayerHelper(object): def set_variable_initializer(self, var, initializer): assert isinstance(var, Variable) - self.startup_program.global_block().create_var( - name=var.name, - type=var.type, - dtype=var.dtype, - shape=var.shape, - persistable=True, - initializer=initializer) + if imperative_base.enabled(): + initializer(var, var.block) + else: + self.startup_program.global_block().create_var( + name=var.name, + type=var.type, + dtype=var.dtype, + shape=var.shape, + persistable=True, + initializer=initializer) def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 9d98e8333ba07ac3eed3a3b63adcba1919cb4694..a7494aaceab42332cb4362ab1df43d9e0b139f4f 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1452,6 +1452,7 @@ class DynamicRNN(object): def step_input(self, x): """ Mark a sequence as a dynamic RNN input. + Args: x(Variable): The input sequence. @@ -1505,6 +1506,7 @@ class DynamicRNN(object): """ Mark a variable as a RNN input. The input will not be scattered into time steps. + Args: x(Variable): The input variable. @@ -1629,13 +1631,11 @@ class DynamicRNN(object): Args: init(Variable|None): The initialized variable. - shape(list|tuple): The memory shape. NOTE the shape does not contain - batch_size. + shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size. value(float): the initalized value. - need_reorder(bool): True if the initialized memory depends on the - input sample. + need_reorder(bool): True if the initialized memory depends on the input sample. dtype(str|numpy.dtype): The data type of the initialized memory. @@ -1714,6 +1714,7 @@ class DynamicRNN(object): """ Update the memory from ex_mem to new_mem. NOTE that the shape and data type of :code:`ex_mem` and :code:`new_mem` must be same. + Args: ex_mem(Variable): the memory variable. new_mem(Variable): the plain variable generated in RNN block. diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index ce731f39ea099a4d8948812989ad19b3cce119ff..8aed97dc59b100d4e37832e0a148d73662742ba0 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred, rpn_negative_overlap=0.3, use_random=True): """ - ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. ** + **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.** This layer can be, for given the Intersection-over-Union (IoU) overlap between anchors and ground truth boxes, to assign classification and @@ -135,19 +135,20 @@ def rpn_target_assign(bbox_pred, Examples: .. code-block:: python - bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], - append_batch_size=False, dtype='float32') - cls_logits = layers.data(name='cls_logits', shape=[100, 1], - append_batch_size=False, dtype='float32') - anchor_box = layers.data(name='anchor_box', shape=[20, 4], - append_batch_size=False, dtype='float32') - gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], - append_batch_size=False, dtype='float32') - loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = - fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, - cls_logits=cls_logits, - anchor_box=anchor_box, - gt_boxes=gt_boxes) + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], + append_batch_size=False, dtype='float32') + cls_logits = layers.data(name='cls_logits', shape=[100, 1], + append_batch_size=False, dtype='float32') + anchor_box = layers.data(name='anchor_box', shape=[20, 4], + append_batch_size=False, dtype='float32') + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], + append_batch_size=False, dtype='float32') + loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, + anchor_box=anchor_box, + gt_boxes=gt_boxes) + """ helper = LayerHelper('rpn_target_assign', **locals()) @@ -1519,27 +1520,30 @@ def anchor_generator(input, Args: input(Variable): The input feature map, the format is NCHW. anchor_sizes(list|tuple|float): The anchor sizes of generated anchors, - given in absolute pixels e.g. [64., 128., 256., 512.]. - For instance, the anchor size of 64 means the area of this anchor equals to 64**2. + given in absolute pixels e.g. [64., 128., 256., 512.]. + For instance, the anchor size of 64 means the area of this anchor equals to 64**2. aspect_ratios(list|tuple|float): The height / width ratios of generated - anchors, e.g. [0.5, 1.0, 2.0]. + anchors, e.g. [0.5, 1.0, 2.0]. variance(list|tuple): The variances to be used in box regression deltas. - Default:[0.1, 0.1, 0.2, 0.2]. - stride(list|turple): The anchors stride across width and height, - e.g. [16.0, 16.0] + Default:[0.1, 0.1, 0.2, 0.2]. + stride(list|turple): The anchors stride across width and height,e.g. [16.0, 16.0] offset(float): Prior boxes center offset. Default: 0.5 name(str): Name of the prior box op. Default: None. Returns: - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. - H is the height of input, W is the width of input, - num_anchors is the box count of each position. - Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - Variances(Variable): The expanded variances of anchors - with a layout of [H, W, num_priors, 4]. - H is the height of input, W is the width of input - num_anchors is the box count of each position. - Each variance is in (xcenter, ycenter, w, h) format. + Anchors(Variable),Variances(Variable): + + two variables: + + - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \ + H is the height of input, W is the width of input, \ + num_anchors is the box count of each position. \ + Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. + - Variances(Variable): The expanded variances of anchors \ + with a layout of [H, W, num_priors, 4]. \ + H is the height of input, W is the width of input \ + num_anchors is the box count of each position. \ + Each variance is in (xcenter, ycenter, w, h) format. Examples: @@ -1748,35 +1752,35 @@ def generate_proposals(scores, eta=1.0, name=None): """ - ** Generate proposal Faster-RCNN ** - - This operation proposes RoIs according to each box with their probability to be a foreground object and - the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals - could be used to train detection net. - - For generating proposals, this operation performs following steps: - - 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) - 2. Calculate box locations as proposals candidates. - 3. Clip boxes to image - 4. Remove predicted boxes with small area. - 5. Apply NMS to get final proposals as output. - - - Args: - scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. - N is batch size, A is number of anchors, H and W are height and width of the feature map. - bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. - im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale - between origin image size and the size of feature map. - anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, - num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. - pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. - post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default. - nms_thresh(float): Threshold in NMS, 0.5 by default. - min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. - eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. + **Generate proposal Faster-RCNN** + + This operation proposes RoIs according to each box with their probability to be a foreground object and + the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals + could be used to train detection net. + + For generating proposals, this operation performs following steps: + + 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) + 2. Calculate box locations as proposals candidates. + 3. Clip boxes to image + 4. Remove predicted boxes with small area. + 5. Apply NMS to get final proposals as output. + + Args: + scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and width of the feature map. + bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. + im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale + between origin image size and the size of feature map. + anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, + num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. + variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. + pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. + post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default. + nms_thresh(float): Threshold in NMS, 0.5 by default. + min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. + eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. + """ helper = LayerHelper('generate_proposals', **locals()) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 42f4959a83fe113d6cbbe0db355249a9c203d602..9a29b2509357c93a684d736cf0d2523970fb5ff1 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -949,12 +949,11 @@ def shuffle(reader, buffer_size): is determined by argument buf_size. Args: - param reader: the original reader whose output will be shuffled. - type reader: callable - param buf_size: shuffle buffer size. - type buf_size: int - return: the new reader whose output is shuffled. - rtype: callable + reader(callable): the original reader whose output will be shuffled. + buf_size(int): shuffle buffer size. + + Returns: + callable: the new reader whose output is shuffled. """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ee165d092c1571f1775ac1072fc00777d1c99744..8c7dd232ae20c98f7d4f39a7ad7bf38f1d211cb9 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -233,7 +233,7 @@ def fc(input, dimensions will be flatten to form the first dimension of the final matrix (height of the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to form the second dimension of the final matrix (width of the matrix). For example, suppose - `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. @@ -340,9 +340,7 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - remote_prefetch = False - if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): - remote_prefetch = True + remote_prefetch = is_sparse and (not is_distributed) if remote_prefetch: assert is_sparse is True and is_distributed is False w = helper.create_parameter( @@ -505,31 +503,33 @@ def lstm(input, In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: - $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + .. math:: + + i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) - $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) - $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) - $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) - $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} - $$ h_t = o_t \\odot tanh(c_t) $$ + h_t &= o_t \odot tanh(c_t) - - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix of weights from the input gate to the input) - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). - sigmoid is the logistic sigmoid function. - $i, f, o$ and $c$ are the input gate, forget gate, output gate, and cell activation vectors, respectively, all of which have the same size as the cell output activation vector $h$. - - The $\odot$ is the element-wise product of the vectors. - - `tanh` is the activation functions. - - $\tilde{c_t}$ is also called candidate hidden state, + - The :math:`\odot` is the element-wise product of the vectors. + - :math:`tanh` is the activation functions. + - :math:`\\tilde{c_t}` is also called candidate hidden state, which is computed based on the current input and the previous hidden state. - Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, + Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, X represensts a matrix multiplication @@ -556,14 +556,18 @@ def lstm(input, Returns: - rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) - if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) - last_h(Tensor): the hidden state of the last step of LSTM - shape is ( num_layers x batch_size x hidden_size ) - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) - last_c(Tensor): the cell state of the last step of LSTM - shape is ( num_layers x batch_size x hidden_size ) - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + rnn_out(Tensor),last_h(Tensor),last_c(Tensor): + + Three tensors, rnn_out, last_h, last_c: + + - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \ + if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) + - last_h is the hidden state of the last step of LSTM \ + shape is ( num_layers x batch_size x hidden_size ) \ + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + - last_c(Tensor): the cell state of the last step of LSTM \ + shape is ( num_layers x batch_size x hidden_size ) \ + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) Examples: @@ -1220,6 +1224,8 @@ def dropout(x, probability) the outputs of some units to zero, while others are remain unchanged. + dropout op can be removed from the program to make the program more efficient. + Args: x (Variable): The input tensor variable. dropout_prob (float): Probability of setting units to zero. @@ -1230,20 +1236,22 @@ def dropout(x, units will be dropped. DO NOT use a fixed seed in training. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. - dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train'] + dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train'] + 1. downgrade_in_infer(default), downgrade the outcome at inference - train: out = input * mask - inference: out = input * dropout_prob - (make is a tensor same shape with input, value is 0 or 1 - ratio of 0 is dropout_prob) + + - train: out = input * mask + - inference: out = input * dropout_prob + + (mask is a tensor same shape with input, value is 0 or 1 + ratio of 0 is dropout_prob) 2. upscale_in_train, upscale the outcome at training time - train: out = input * mask / ( 1.0 - dropout_prob ) - inference: out = input - (make is a tensor same shape with input, value is 0 or 1 - ratio of 0 is dropout_prob) - dropout op can be removed from the program. - the program will be efficient + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input + + (mask is a tensor same shape with input, value is 0 or 1 + ratio of 0 is dropout_prob) Returns: @@ -1333,11 +1341,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): A 2-D tensor with shape [N x 1], the cross entropy loss. Raises: - `ValueError`: 1) the 1st dimension of `input` and `label` are not equal. - 2) when `soft_label == True`, and the 2nd dimension of - `input` and `label` are not equal. - 3) when `soft_label == False`, and the 2nd dimension of - `label` is not 1. + ValueError: + + 1. the 1st dimension of ``input`` and ``label`` are not equal. + + 2. when ``soft_label == True``, and the 2nd dimension of + ``input`` and ``label`` are not equal. + + 3. when ``soft_label == False``, and the 2nd dimension of + ``label`` is not 1. Examples: .. code-block:: python @@ -1458,7 +1470,7 @@ def chunk_eval(input, F1-score of chunk detection. For some basics of chunking, please refer to - 'Chunking with Support Vector Machines '. + `Chunking with Support Vector Machines `_ . ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. @@ -1823,7 +1835,7 @@ def conv2d(input, of conv2d. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv2d @@ -2276,7 +2288,7 @@ def sequence_slice(input, offset, length, name=None): .. code-block:: text - - Case: + - Case: Given the input Variable **input**: @@ -2292,7 +2304,8 @@ def sequence_slice(input, offset, length, name=None): out.lod = [[2, 1]], out.dims = (3, 2). - NOTE: The first dimension size of **input**, **offset** and **length** + Note: + The first dimension size of **input**, **offset** and **length** should be equal. The **offset** should start from 0. Args: @@ -2540,12 +2553,12 @@ def adaptive_pool2d(input, Examples: .. code-block:: python - # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], + # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], # output shape is [N, C, m, n], adaptive pool divide H and W dimentions - # of input data into m * n grids averagely and performs poolings in each + # of input data into m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: - # + # # for i in range(m): # for j in range(n): # hstart = floor(i * H / m) @@ -2570,12 +2583,7 @@ def adaptive_pool2d(input, raise ValueError( "invalid setting 'require_index' true when 'pool_type' is 'avg'.") - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2: - raise ValueError( - "'pool_size' should be a list or tuple with length as 2.") + pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') if pool_type == "max": l_type = 'max_pool2d_with_index' @@ -2639,10 +2647,10 @@ def adaptive_pool3d(input, # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions - # of input data into l * m * n grids averagely and performs poolings in each + # of input data into l * m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: - # + # # for i in range(l): # for j in range(m): # for k in range(n): @@ -2652,7 +2660,7 @@ def adaptive_pool3d(input, # hend = ceil((j + 1) * H / m) # wstart = floor(k * W / n) # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = + # output[:, :, i, j, k] = # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) # data = fluid.layers.data( @@ -2671,12 +2679,7 @@ def adaptive_pool3d(input, raise ValueError( "invalid setting 'require_index' true when 'pool_type' is 'avg'.") - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3: - raise ValueError( - "'pool_size' should be a list or tuple with length as 3.") + pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') if pool_type == "max": l_type = 'max_pool3d_with_index' @@ -3013,7 +3016,7 @@ def group_norm(input, """ **Group Normalization Layer** - Refer to `Group Normalization ` + Refer to `Group Normalization `_ . Args: input(Variable): The input tensor variable. @@ -3140,8 +3143,8 @@ def conv2d_transpose(input, H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ - H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ - W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) + H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ + W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) Args: input(Variable): The input image with [N, C, H, W] format. @@ -4704,9 +4707,9 @@ def ctc_greedy_decoder(input, blank, name=None): name (str): The name of this layer. It is optional. Returns: - Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. - 'Lp' is the sum if all output sequences' length. If all the sequences - in result were empty, the result LoDTensor will be [-1] with + Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \ + 'Lp' is the sum if all output sequences' length. If all the sequences \ + in result were empty, the result LoDTensor will be [-1] with \ LoD [[]] and dims [1, 1]. Examples: @@ -5029,12 +5032,18 @@ def nce(input, else: num_neg_samples = int(num_neg_samples) + remote_prefetch = is_sparse + print( + "With sparse mode, if your models has only small parameter prefetch may cause speed down" + ) + attrs = { 'num_total_classes': int(num_total_classes), 'num_neg_samples': num_neg_samples, 'seed': seed, 'sampler': sampler, - 'is_sparse': is_sparse + 'is_sparse': is_sparse, + 'remote_prefetch': remote_prefetch } helper.append_op( @@ -5074,13 +5083,13 @@ def hsigmoid(input, `_ And if you want to use the costumed tree by set 'is_custom' as true you may need to do following things first: - 1. using your word dict to build a binary tree, each leaf node should be an word of your word dict - 2. build a dict to store word_id -> word's leaf to root path, we call it path_table. - 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code - means label of each binary classification, using 1 indicate true, 0 indicate false. - 4. now, each word should has its path and code along the path, you can pass a batch of path and code - related to the same batch of inputs. + 1. using your word dict to build a binary tree, each leaf node should be an word of your word dict + 2. build a dict to store word_id -> word's leaf to root path, we call it path_table. + 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code + means label of each binary classification, using 1 indicate true, 0 indicate false. + 4. now, each word should has its path and code along the path, you can pass a batch of path and code + related to the same batch of inputs. Args: input (Variable): The input tensor variable with shape @@ -5144,7 +5153,10 @@ def hsigmoid(input, pass weights = None - + remote_prefetch = is_sparse + print( + "With sparse mode, if your models has only small parameter prefetch may cause speed down" + ) if not is_custom: weights = helper.create_parameter( attr=helper.param_attr, @@ -5160,7 +5172,7 @@ def hsigmoid(input, inputs = { "X": input, "W": weights, - "PTable": path_table, + "PathTable": path_table, "PathCode": path_code, "Label": label } @@ -5183,9 +5195,13 @@ def hsigmoid(input, type="hierarchical_sigmoid", inputs=inputs, outputs={"Out": out, - "PreOut": pre_out}, - attrs={"num_classes": num_classes, - "is_sparse": is_sparse}) + "PreOut": pre_out, + "W_Out": weights}, + attrs={ + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": remote_prefetch + }) return out @@ -5487,11 +5503,11 @@ def softmax_with_cross_entropy(logits, .. math:: - max_j = \\max_{i=0}^{K}{\\text{logit}_i} + max_j &= \\max_{i=0}^{K}{\\text{logit}_i} - log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) + log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) - softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j) + softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j) and then cross entropy loss is calculated by softmax and label. @@ -5517,11 +5533,11 @@ def softmax_with_cross_entropy(logits, along with the cross entropy loss. Default: False Returns: - Variable or Tuple of two Variables: Return the cross entropy loss if - `return_softmax` is False, otherwise the tuple - (loss, softmax), where the cross entropy loss is - a 2-D tensor with shape [N x 1], and softmax is a - 2-D tensor with shape [N x K]. + Variable or Tuple of two Variables: Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), where the cross entropy loss is \ + a 2-D tensor with shape [N x 1], and softmax is a \ + 2-D tensor with shape [N x K]. Examples: .. code-block:: python @@ -5794,21 +5810,27 @@ def squeeze(input, axes, name=None): the single dimensions will be removed from the shape. If an axis is selected with shape entry not equal to one, an error is raised. - Examples: - Case 1: - Given - X.shape = (1, 3, 1, 5) - and - axes = [0] - we get: - Out.shape = (3, 1, 5) - Case 2: - Given - X.shape = (1, 3, 1, 5) - and - axes = [] - we get: - Out.shape = (3, 5) + For example: + + .. code-block:: text + + Case 1: + + Given + X.shape = (1, 3, 1, 5) + and + axes = [0] + we get: + Out.shape = (3, 1, 5) + + Case 2: + + Given + X.shape = (1, 3, 1, 5) + and + axes = [] + we get: + Out.shape = (3, 5) Args: input (Variable): The input variable to be squeezed. @@ -5844,6 +5866,9 @@ def unsqueeze(input, axes, name=None): Dimension indices in axes are as seen in the output tensor. For example: + + .. code-block:: text + Given a tensor such that tensor with shape [3, 4, 5], then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. @@ -6731,8 +6756,11 @@ def sequence_scatter(input, index, updates, name=None): the columns to update in each row of X. Here is an example: + Given the following input: + .. code-block:: text + input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]] @@ -6745,7 +6773,9 @@ def sequence_scatter(input, index, updates, name=None): updates.lod = [[ 0, 3, 8, 12]] Then we have the output: + .. code-block:: text + out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0], [1.0, 1.0, 1.4, 1.3, 1.2, 1.1], [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]] @@ -6761,7 +6791,7 @@ def sequence_scatter(input, index, updates, name=None): name (str|None): The output variable name. Default None. Returns: - output (Variable): The output is a tensor with the same shape as input. + Variable: The output is a tensor with the same shape as input. Examples: @@ -6935,7 +6965,7 @@ def mean_iou(input, label, num_classes): .. math:: - IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}. + IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}. The predictions are accumulated in a confusion matrix and mean-IOU is then calculated from it. @@ -6948,9 +6978,13 @@ def mean_iou(input, label, num_classes): num_classes (int): The possible number of labels. Returns: - mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1]. - out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class. - out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. + mean_iou (Variable),out_wrong(Variable),out_correct(Variable): + + Three variables: + + - mean_iou : A Tensor representing the mean intersection-over-union with shape [1]. + - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class. + - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class. Examples: @@ -7146,7 +7180,7 @@ def affine_grid(theta, out_shape, name=None): Args: theta (Variable): A batch of affine transform parameters with shape [N, 2, 3]. out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. - out_shape can be a Variable or a list or tuple. + ``out_shape`` can be a Variable or a list or tuple. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -7159,6 +7193,7 @@ def affine_grid(theta, out_shape, name=None): Examples: .. code-block:: python + theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32") out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32") data = fluid.layers.affine_grid(theta, out_shape) @@ -7194,9 +7229,10 @@ def affine_grid(theta, out_shape, name=None): def rank_loss(label, left, right, name=None): """ + **Rank loss layer for RankNet** - RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf) + `RankNet `_ is a pairwise ranking model with a training sample consisting of a pair of documents, A and B. Label P indicates whether A is ranked higher than B or not: @@ -7204,16 +7240,19 @@ def rank_loss(label, left, right, name=None): P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information about the rank of the input pair. - Rank loss layer takes three inputs: left (o_i), right (o_j) and - label (P_{i,j}). The inputs respectively represent RankNet's output scores + Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and + label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores for documents A and B and the value of label P. The following equation computes rank loss C_{i,j} from the inputs: - $$ - C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ - o_{i,j} = o_i - o_j \\ - \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} - $$ + .. math:: + + C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\ + + o_{i,j} &= o_i - o_j \\\\ + + \\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \} + Rank loss layer takes batch inputs with size batch_size (batch_size >= 1). @@ -7239,7 +7278,6 @@ def rank_loss(label, left, right, name=None): right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") out = fluid.layers.rank_loss(label, left, right) - """ helper = LayerHelper('rank_loss', **locals()) @@ -7271,7 +7309,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): .. math:: - rank\_loss &= max(0, -label * (left - right) + margin) + rank\_loss = max(0, -label * (left - right) + margin) Args: label (Variable): Indicates whether the left is ranked higher than the right or not. @@ -7280,12 +7318,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): margin (float): Indicates the given margin. name (str|None): A name for this layer (optional). If set None, the layer will be named automatically. + Returns: Variable: The ranking loss. + Raises: ValueError: Any of label, left, and right is not a Variable. + Examples: + .. code-block:: python + label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") @@ -7589,7 +7632,8 @@ def prelu(x, mode, param_attr=None, name=None): """ Equation: - y = \max(0, x) + alpha * \min(0, x) + .. math:: + y = \max(0, x) + \\alpha * \min(0, x) Args: x (Variable): The input tensor. @@ -7653,10 +7697,10 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): Examples: - .. code-block:: python + .. code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) """ helper = LayerHelper('brelu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7685,8 +7729,8 @@ def leaky_relu(x, alpha=0.02, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.leaky_relu(x, alpha=0.01) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.leaky_relu(x, alpha=0.01) """ helper = LayerHelper('leaky_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7714,8 +7758,8 @@ def soft_relu(x, threshold=40.0, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.soft_relu(x, threshold=20.0) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.soft_relu(x, threshold=20.0) """ helper = LayerHelper('soft_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7732,22 +7776,31 @@ def flatten(x, axis=1, name=None): **Flatten layer** Flattens the input tensor into a 2D matrix. - Examples: - Case 1: - Given - X.shape = (3, 100, 100, 4) - and - axis = 2 - We get: - Out.shape = (3 * 100, 4 * 100) - - Case 2: - Given - X.shape = (3, 100, 100, 4) - and - axis = 0 - We get: - Out.shape = (1, 3 * 100 * 100 * 4) + For Example: + + .. code-block:: text + + Case 1: + + Given + X.shape = (3, 100, 100, 4) + + and + axis = 2 + + We get: + Out.shape = (3 * 100, 4 * 100) + + Case 2: + + Given + X.shape = (3, 100, 100, 4) + + and + axis = 0 + + We get: + Out.shape = (1, 3 * 100 * 100 * 4) Args: x (Variable): A tensor of rank >= axis. @@ -7761,9 +7814,9 @@ def flatten(x, axis=1, name=None): will be named automatically. Returns: - Variable: A 2D tensor with the contents of the input tensor, with input - dimensions up to axis flattened to the outer dimension of - the output and remaining input dimensions flattened into the + Variable: A 2D tensor with the contents of the input tensor, with input \ + dimensions up to axis flattened to the outer dimension of \ + the output and remaining input dimensions flattened into the \ inner dimension of the output. Raises: @@ -7803,19 +7856,23 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): The enumerated sequence has the same 1st dimension with variable `input`, and the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. - Examples: - Case 1: - Input: - X.lod = [[0, 3, 5]] - X.data = [[1], [2], [3], [4], [5]] - X.dims = [5, 1] - Attrs: - win_size = 2 - pad_value = 0 - Output: - Out.lod = [[0, 3, 5]] - Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] - Out.dims = [5, 2] + .. code-block:: text + + Case 1: + + Input: + X.lod = [[0, 3, 5]] + X.data = [[1], [2], [3], [4], [5]] + X.dims = [5, 1] + + Attrs: + win_size = 2 + pad_value = 0 + + Output: + Out.lod = [[0, 3, 5]] + Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] + Out.dims = [5, 2] Args: input (Variable): The input variable which is a index sequence. @@ -8898,6 +8955,7 @@ def similarity_focus(input, axis, indexes, name=None): SimilarityFocus Operator Generate a similarity focus mask with the same shape of input using the following method: + 1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding to the axis according to the indexes. For example, if axis=1 and indexes=[a], it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X @@ -8971,14 +9029,16 @@ def similarity_focus(input, axis, indexes, name=None): indexes(list): Indicating the indexes of the selected dimension. Returns: - Variable: A tensor variable with the same shape and same type - as the input. + Variable: A tensor variable with the same shape and same type \ + as the input. Examples: .. code-block:: python + data = fluid.layers.data( name='data', shape=[2, 3, 2, 2], dtype='float32') x = fluid.layers.layer_norm(input=data, axis=1, indexes=[0]) + """ helper = LayerHelper('similarity_focus', **locals()) # check attrs @@ -9057,6 +9117,7 @@ def hash(input, hash_size, num_hash=1, name=None): Examples: .. code-block:: python + word_dict = paddle.dataset.imdb.word_dict() x = fluid.layers.data(shape[1], dtype='int32', lod_level=1) out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000) @@ -9077,50 +9138,52 @@ def hash(input, hash_size, num_hash=1, name=None): def grid_sampler(x, grid, name=None): """ This operation samples input X by using bilinear interpolation based on - flow field grid, which is usually gennerated by affine_grid. The grid of + flow field grid, which is usually gennerated by :code:`affine_grid` . The grid of shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates with shape [N, H, W] each, where grid_x is indexing the 4th dimension (in width dimension) of input data x and grid_y is indexng the 3rd dimention (in height dimension), finally results is the bilinear interpolation value of 4 nearest corner points. - Step 1: - Get (x, y) grid coordinates and scale to [0, H-1/W-1]. + .. code-block:: text - grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) - grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) + Step 1: + Get (x, y) grid coordinates and scale to [0, H-1/W-1]. - Step 2: - Indices input data X with grid (x, y) in each [H, W] area, and bilinear - interpolate point value by 4 nearest points. + grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) + grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) - wn ------- y_n ------- en - | | | - | d_n | - | | | - x_w --d_w-- grid--d_e-- x_e - | | | - | d_s | - | | | - ws ------- y_s ------- wn + Step 2: + Indices input data X with grid (x, y) in each [H, W] area, and bilinear + interpolate point value by 4 nearest points. - x_w = floor(x) // west side x coord - x_e = x_w + 1 // east side x coord - y_n = floor(y) // north side y coord - y_s = y_s + 1 // south side y coord + wn ------- y_n ------- en + | | | + | d_n | + | | | + x_w --d_w-- grid--d_e-- x_e + | | | + | d_s | + | | | + ws ------- y_s ------- wn - d_w = grid_x - x_w // distance to west side - d_e = x_e - grid_x // distance to east side - d_n = grid_y - y_n // distance to north side - d_s = y_s - grid_y // distance to south side + x_w = floor(x) // west side x coord + x_e = x_w + 1 // east side x coord + y_n = floor(y) // north side y coord + y_s = y_s + 1 // south side y coord - wn = X[:, :, y_n, x_w] // north-west point value - en = X[:, :, y_n, x_e] // north-east point value - ws = X[:, :, y_s, x_w] // south-east point value - es = X[:, :, y_s, x_w] // north-east point value + d_w = grid_x - x_w // distance to west side + d_e = x_e - grid_x // distance to east side + d_n = grid_y - y_n // distance to north side + d_s = y_s - grid_y // distance to south side - output = wn * d_e * d_s + en * d_w * d_s - + ws * d_e * d_n + es * d_w * d_n + wn = X[:, :, y_n, x_w] // north-west point value + en = X[:, :, y_n, x_e] // north-east point value + ws = X[:, :, y_s, x_w] // south-east point value + es = X[:, :, y_s, x_w] // north-east point value + + output = wn * d_e * d_s + en * d_w * d_s + + ws * d_e * d_n + es * d_w * d_n Args: x(Variable): Input data of shape [N, C, H, W]. @@ -9128,16 +9191,18 @@ def grid_sampler(x, grid, name=None): name (str, default None): The name of this layer. Returns: - out(Variable): Output of shape [N, C, H, W] data samples input X + Variable: Output of shape [N, C, H, W] data samples input X using bilnear interpolation based on input grid. - Exmples: - .. code-block:: python + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') + theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') + grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]}) + out = fluid.layers.grid_sampler(x=x, grid=grid) - x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') - theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') - grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]}) - out = fluid.layers.grid_sampler(x=x, grid=grid) """ helper = LayerHelper("grid_sampler", **locals()) @@ -9205,19 +9270,19 @@ def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** - This layer accepts an input 3D-Tensor of shape [N x M x P], and return an + This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an output Tensor of shape [N x M x P] with positional encoding value. - Refer to `Attention Is All You Need`_ . + Refer to `Attention Is All You Need `_ . .. math:: - PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})} \\\\ - PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})} \\\\ - Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i) + PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\ + PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})} \\\\ + Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i) Where: - * PE(pos, 2i): the increment for the number at even position - * PE(pos, 2i + 1): the increment for the number at odd position + - :math:`PE(pos, 2i)` : the increment for the number at even position + - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position Args: input (Variable): 3-D input tensor with shape [N x M x P] @@ -9232,6 +9297,7 @@ def add_position_encoding(input, alpha, beta, name=None): .. code-block:: python position_tensor = fluid.layers.add_position_encoding(input=tensor) + """ helper = LayerHelper('add_position_encoding', **locals()) dtype = helper.input_dtype() @@ -9264,13 +9330,13 @@ def bilinear_tensor_product(x, For example: .. math:: - out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 + out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 In this formula: - :math:`x`: the first input contains M elements, shape is [batch_size, M]. - :math:`y`: the second input contains N elements, shape is [batch_size, N]. - :math:`W_{i}`: the i-th learned weight, shape is [M, N] - - :math:`out{i}`: the i-th element of out, shape is [batch_size, size]. + - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size]. - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`. Args: @@ -9350,7 +9416,7 @@ class PyFuncRegistry(object): raise TypeError('func must be a Python function') self._func = func - # find named args using reflection + # find named args using reflection args = inspect.getargspec(self._func) if len(args[0]) == 0 and args[1] is None and args[2] is None: # Function with no inputs @@ -9361,15 +9427,15 @@ class PyFuncRegistry(object): ''' Why record self here? - 1. For debug usage. Users can call - :code:`py_func.registered_func(idx)` method + 1. For debug usage. Users can call + :code:`py_func.registered_func(idx)` method to find the registered function corresponding - to :code:`idx`. + to :code:`idx`. - 2. For increasing reference count of self. - It seems that to release Python object + 2. For increasing reference count of self. + It seems that to release Python object whose reference count is 1 would cause - segmentation fault error in C++ side. + segmentation fault error in C++ side. May be lack of Python GC in C++ side? ''' PyFuncRegistry._register_funcs.append(self) @@ -9420,7 +9486,7 @@ class PyFuncRegistry(object): def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): """ PyFunc Operator. - + User can use :code:`py_func` to register operators in Python side. The inputs of :code:`func` is :code:`LoDTensor` and outputs can be numpy array or :code:`LoDTensor`. Paddle would call the registered @@ -9438,7 +9504,7 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): no gradient, users should return None. This function can also be used to debug the running network. User can - add a :code:`py_func` operator without output, and print input + add a :code:`py_func` operator without output, and print input :code:`x` inside :code:`func`. Args: @@ -9446,50 +9512,50 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`. out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`. Paddle cannot infer shapes and data types of :code:`out`. Users - should create :code:`out` beforehand. + should create :code:`out` beforehand. backward_func (callable|None): backward Python function. - None means no backward. Default None. + None means no backward. Default None. skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)): - Variables that are not needed in :code:`backward_func` inputs. + Variables that are not needed in :code:`backward_func` inputs. These variables must be any of :code:`x` and :code:`out`. If set, these vars would not be inputs of :code:`backward_func`, - Only useful when :code:`backward_func` is not None. Default None. + Only useful when :code:`backward_func` is not None. Default None. Returns: out (Variable|list(Variable)|tuple(Variable)): input :code:`out` Examples: - + >>> import paddle.fluid as fluid >>> import six >>> >>> def create_tmp_var(name, dtype, shape): >>> return fluid.default_main_program().current_block().create_var( - >>> name=name, dtype=dtype, shape=shape) + >>> name=name, dtype=dtype, shape=shape) >>> >>> # tanh activation has been provided by Paddle C++ op - >>> # Here, we only use tanh to be an example to show the usage + >>> # Here, we only use tanh to be an example to show the usage >>> # of py_func >>> def tanh(x): >>> return np.tanh(x) - >>> + >>> >>> # forward input x is skipped >>> def tanh_grad(y, dy): >>> return np.array(dy) * (1 - np.square(np.array(y))) >>> >>> def debug_func(x): - >>> print(x) + >>> print(x) >>> >>> def simple_net(img, label): >>> hidden = img >>> for idx in six.moves.range(4): >>> hidden = fluid.layers.fc(hidden, size=200) >>> new_hidden = create_tmp_var(name='hidden_{}'.format(idx), - >>> dtype=hidden.dtype, shape=hidden.shape) + >>> dtype=hidden.dtype, shape=hidden.shape) >>> >>> # user-defined layers with forward and backward - >>> hidden = fluid.layers.py_func(func=tanh, x=hidden, - >>> out=new_hidden, backward_func=tanh_grad, + >>> hidden = fluid.layers.py_func(func=tanh, x=hidden, + >>> out=new_hidden, backward_func=tanh_grad, >>> skip_vars_in_backward_input=hidden) >>> >>> # user-defined debug layers to print variables @@ -9660,47 +9726,3 @@ def huber_loss(input, label, delta): 'Residual': residual}, attrs={'delta': delta}) return out - - -class FC(layers.PyLayer): - def __init__(self, - size, - param_attr=None, - num_flatten_dims=1, - dtype=core.VarDesc.VarType.FP32): - super(FC, self).__init__() - self._size = size - self._num_flatten_dims = num_flatten_dims - self._dtype = dtype - self._helper = LayerHelper('FC', param_attr=param_attr) - - def _build_once(self, inputs): - input_shape = inputs[0].shape - param_shape = [ - reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) - ] + [self._size] - self._w = self._helper.create_parameter( - attr=self._helper.param_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=False) - - def forward(self, inputs): - tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="mul", - inputs={"X": inputs[0], - "Y": self._w}, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": self._num_flatten_dims, - "y_num_col_dims": 1 - }) - - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="sum", - inputs={"X": [tmp]}, - outputs={"Out": out}, - attrs={"use_mkldnn": False}) - return out diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 4f73194d82e57336e1c496d9e5c6d40bb0eef15c..bd2a7294694a412a381eb44f372b1ac9c033d0e4 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -20,6 +20,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc +from ..imperative import base as imperative_base from .layer_function_generator import templatedoc import numpy @@ -104,15 +105,15 @@ def create_global_var(shape, Args: shape(list[int]): shape of the variable - value(float): the value of the variable. The new created + value(float): the value of the variable. The new created variable will be filled with it. dtype(string): data type of the variable - persistable(bool): if this variable is persistable. + persistable(bool): if this variable is persistable. Default: False - force_cpu(bool): force this variable to be on CPU. + force_cpu(bool): force this variable to be on CPU. Default: False - name(str|None): The name of the variable. If set to None the variable - name will be generated automatically. + name(str|None): The name of the variable. If set to None the variable + name will be generated automatically. Default: None Returns: @@ -121,21 +122,26 @@ def create_global_var(shape, Examples: .. code-block:: python - var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', + var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', persistable=True, force_cpu=True, name='new_var') """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( - dtype=dtype, shape=shape, persistable=persistable, name=name) + dtype=dtype, + shape=shape, + persistable=persistable, + name=name, + stop_gradient=True) helper.set_variable_initializer( var, initializer=Constant( value=float(value), force_cpu=force_cpu)) + return var def cast(x, dtype): """ - This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts + This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts it to the output with :attr:`dtype`. Args: @@ -199,9 +205,9 @@ def tensor_array_to_tensor(input, axis=1, name=None): and returns that as the output. A simple example as below: - + .. code-block:: text - + Given: input.data = {[[0.6, 0.1, 0.3], @@ -210,9 +216,9 @@ def tensor_array_to_tensor(input, axis=1, name=None): [1.8]], [[2.3, 2.1], [2.5, 2.4]]} - + axis = 1 - + Then: output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1], @@ -416,9 +422,6 @@ def fill_constant_batch_size_like(input, It also sets *stop_gradient* to True. - >>> data = fluid.layers.fill_constant_batch_size_like( - >>> input=like, shape=[1], value=0, dtype='int64') - Args: input(${input_type}): ${input_comment}. @@ -434,6 +437,14 @@ def fill_constant_batch_size_like(input, Returns: ${out_comment}. + + Examples: + + .. code-block:: python + + data = fluid.layers.fill_constant_batch_size_like( + input=like, shape=[1], value=0, dtype='int64') + """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_variable_for_type_inference(dtype=dtype) @@ -516,12 +527,12 @@ def argmax(x, axis=0): def argsort(input, axis=-1, name=None): """ - Performs sorting on the input Variable along the given axis, and outputs - sorted data Varibale and its corresponding index Variable with the same + Performs sorting on the input Variable along the given axis, and outputs + sorted data Varibale and its corresponding index Variable with the same shape as :attr:`input`. .. code-block:: text - + For example, the given axis is -1 and the input Variable input = [[0.15849551, 0.45865775, 0.8563702 ], @@ -534,15 +545,15 @@ def argsort(input, axis=-1, name=None): and the sorted indices along the given axis turn outs to be - indices = [[0, 1, 2], + indices = [[0, 1, 2], [0, 2, 1]] Args: input(Variable): The input Variable for sorting. - axis(int): The axis along which to sort the input Variable. When - :attr:`axis` < 0, the actual axis will be :attr:`axis` + + axis(int): The axis along which to sort the input Variable. When + :attr:`axis` < 0, the actual axis will be :attr:`axis` + rank(:attr:`input`). Default -1, the last dimension. - name(str|None): (optional) A name for this layer. If set None, the + name(str|None): (optional) A name for this layer. If set None, the layer will be named automatically. Returns: diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 85af8fea13d5b9a1e22014fbd727e1baed3247be..fd07ff0ba3d21721fbbc46099f7dcb6937f93524 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -361,8 +361,8 @@ class ChunkEvaluator(MetricBase): Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. - For some basics of chunking, please refer to - 'Chunking with Support Vector Machines '. + For some basics of chunking, please refer to + `Chunking with Support Vector Machines `_ . ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. @@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase): def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): """ Update the states based on the layers.chunk_eval() ouputs. + Args: num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch. num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch. @@ -450,9 +451,9 @@ class EditDistance(MetricBase): distance, instance_error = distance_evaluator.eval() In the above example: - 'distance' is the average of the edit distance in a pass. - 'instance_error' is the instance error rate in a pass. + - 'distance' is the average of the edit distance in a pass. + - 'instance_error' is the instance error rate in a pass. """ @@ -567,12 +568,15 @@ class DetectionMAP(object): Calculate the detection mean average precision (mAP). The general steps are as follows: + 1. calculate the true positive and false positive according to the input - of detection and labels. + of detection and labels. 2. calculate mAP value, support two versions: '11 point' and 'integral'. Please get more information from the following articles: + https://sanchom.wordpress.com/tag/average-precision/ + https://arxiv.org/abs/1512.02325 Args: @@ -613,10 +617,12 @@ class DetectionMAP(object): for data in batches: loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) - In the above example: + In the above example: + + - 'cur_map_v' is the mAP of current mini-batch. + - 'accum_map_v' is the accumulative mAP of one pass. - 'cur_map_v' is the mAP of current mini-batch. - 'accum_map_v' is the accumulative mAP of one pass. + """ def __init__(self, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 59c22d4e498814d468c78b10265b7afe35461dfb..779cb5f961639aa919827a1c1726e974fdf1cbe1 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -30,6 +30,7 @@ from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops from .regularizer import append_regularization_ops +from .imperative import base as imperative_base __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', @@ -301,25 +302,45 @@ class Optimizer(object): This method combines interface `append_backward()` and `create_optimization_pass()` into one. """ - params_grads = append_backward(loss, parameter_list, no_grad_set, - [error_clip_callback]) + if imperative_base.enabled(): + if parameter_list is not None: + params_grads = parameter_list + else: + program = loss.block.program + parameters = program.global_block().all_parameters() + params_grads = [] + for param in parameters: + # create gradient variable + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True) + grad_var._value = param._ivar.grad_value + params_grads.append((param, grad_var)) + + optimize_ops = self._create_optimization_pass(params_grads, loss, + startup_program) + else: + params_grads = append_backward(loss, parameter_list, no_grad_set, + [error_clip_callback]) + + params_grads = sorted(params_grads, key=lambda x: x[0].name) - params_grads = sorted(params_grads, key=lambda x: x[0].name) + params_grads, table_param_and_grad, table_optimize_op = \ + self._process_distribute_lookuptable(params_grads, loss, startup_program) - params_grads, table_param_and_grad, table_optimize_op = \ - self._process_distribute_lookuptable(params_grads, loss, startup_program) + params_grads = append_gradient_clip_ops(params_grads) - params_grads = append_gradient_clip_ops(params_grads) + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) - # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) + optimize_ops = self._create_optimization_pass(params_grads, loss, + startup_program) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) - optimize_ops = self._create_optimization_pass(params_grads, loss, - startup_program) - if table_optimize_op is not None: - optimize_ops.append(table_optimize_op) - params_grads.append(table_param_and_grad) return optimize_ops, params_grads @@ -364,7 +385,8 @@ class SGDOptimizer(Optimizer): "Grad": param_and_grad[1], "LearningRate": self._create_param_lr(param_and_grad) }, - outputs={"ParamOut": param_and_grad[0]}) + outputs={"ParamOut": param_and_grad[0]}, + stop_gradient=True) return sgd_op @@ -448,7 +470,8 @@ class MomentumOptimizer(Optimizer): "VelocityOut": velocity_acc }, attrs={"mu": self._momentum, - "use_nesterov": self._use_nesterov}) + "use_nesterov": self._use_nesterov}, + stop_gradient=True) return momentum_op @@ -477,7 +500,7 @@ class LarsMomentumOptimizer(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - + Examples: .. code-block:: python @@ -533,7 +556,8 @@ class LarsMomentumOptimizer(Optimizer): "mu": self._momentum, "lars_coeff": self._lars_coeff, "lars_weight_decay": self._lars_weight_decay - }) + }, + stop_gradient=True) return momentum_op @@ -608,7 +632,8 @@ class AdagradOptimizer(Optimizer): }, outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon}) + attrs={"epsilon": self._epsilon}, + stop_gradient=True) return adagrad_op @@ -738,7 +763,8 @@ class AdamOptimizer(Optimizer): "beta2": self._beta2, "epsilon": self._epsilon, "lazy_mode": self._lazy_mode - }) + }, + stop_gradient=True) return adam_op @@ -760,13 +786,15 @@ class AdamOptimizer(Optimizer): type="scale", inputs={"X": beta1_pow_acc}, outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) + attrs={"scale": self._beta1}, + stop_gradient=True) main_block.append_op( type="scale", inputs={"X": beta2_pow_acc}, outputs={"Out": beta2_pow_acc}, - attrs={"scale": self._beta2}) + attrs={"scale": self._beta2}, + stop_gradient=True) class AdamaxOptimizer(Optimizer): @@ -877,7 +905,8 @@ class AdamaxOptimizer(Optimizer): "beta1": self._beta1, "beta2": self._beta2, "epsilon": self._epsilon - }) + }, + stop_gradient=True) return adamax_op @@ -897,7 +926,8 @@ class AdamaxOptimizer(Optimizer): type="scale", inputs={"X": beta1_pow_acc}, outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) + attrs={"scale": self._beta1}, + stop_gradient=True) class DecayedAdagradOptimizer(Optimizer): @@ -979,7 +1009,8 @@ class DecayedAdagradOptimizer(Optimizer): }, outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon}) + attrs={"epsilon": self._epsilon}, + stop_gradient=True) return decayed_adagrad_op @@ -1075,7 +1106,8 @@ class AdadeltaOptimizer(Optimizer): "AvgSquaredUpdateOut": avg_squared_update_acc }, attrs={"epsilon": self._epsilon, - "rho": self._rho}) + "rho": self._rho}, + stop_gradient=True) return adadelta_op @@ -1224,7 +1256,8 @@ class RMSPropOptimizer(Optimizer): "decay": self._rho, "momentum": self._momentum, "centered": self._centered - }) + }, + stop_gradient=True) return rmsprop_op @@ -1345,7 +1378,8 @@ class FtrlOptimizer(Optimizer): }, attrs={"l1": self._l1, "l2": self._l1, - "lr_power": self._lr_power}) + "lr_power": self._lr_power}, + stop_gradient=True) return ftrl_op @@ -1509,7 +1543,8 @@ class ModelAverage(Optimizer): "average_window": self.average_window, "min_average_window": self.min_average_window, "max_average_window": self.max_average_window, - }) + }, + stop_gradient=True) @contextmanager def apply(self, executor, need_restore=True): diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 74cf76da951a4cea884c4fdb8591b3d4fb010300..3b066eda110275dc02e451dfaf0cfe28a3fb7a53 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy +def _is_pserver_mode(main_program): + main = main_program if main_program \ + else framework.default_main_program() + for op in main.global_block().ops: + if op.type in ["send", "recv"]: + return True + return False + + class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -128,6 +137,11 @@ class ParallelExecutor(object): build_strategy = BuildStrategy() build_strategy.num_trainers = num_trainers build_strategy.trainer_id = trainer_id + # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + # num_trainers is 1, so the current fields of build_strategy doesn't tell if + # it's distributed model. + build_strategy.is_distribution = _is_pserver_mode( + main_program) or num_trainers > 1 # step4: get main_program, scope, local_scopes main = main_program if main_program \ @@ -148,7 +162,7 @@ class ParallelExecutor(object): trainers_endpoints), "num_trainers == len(end_points)" build_strategy.trainers_endpoints = trainers_endpoints - # step5: get persistable_vars, parameter_vars, places. persistable_vars + # step6: get persistable_vars, places. persistable_vars # need be broadcast to other local_scope. persistable_vars = set([ cpt.to_text(v.name) for v in [ @@ -164,7 +178,7 @@ class ParallelExecutor(object): places = list(map(place_obj, self._places)) - # step6: init ParallelExecutor + # step7: init ParallelExecutor self.executor = core.ParallelExecutor( places, persistable_vars, main.desc, cpt.to_text(loss_name) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index d744a00242422defb360590b193e07c6f811dcb9..e87c1d58c812f3186658a78c4280f16d34c466ee 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -185,8 +185,10 @@ def main(use_cuda, parallel): if __name__ == '__main__': - for use_cuda in (False, True): - for parallel in (False, True): - if use_cuda and not core.is_compiled_with_cuda(): - continue - main(use_cuda=use_cuda, parallel=parallel) + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + if not on_ci: + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 5d0fd7b1b10feffe638b38e073a1e91693c2b5c4..ec8b19c7ba07a9e57a32277ff3fc34b0ea25a819 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -21,11 +21,13 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) + LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op) + LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op) endif(NOT WITH_DISTRIBUTE) if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) -elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) +elseif(${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..5535427ea8a93fdc5818cdc058aedb6fe72165ee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp + + +class TestNGRAPHMeanOp(TestMeanOp): + def setUp(self): + super(TestNGRAPHMeanOp, self).setUp() + + +class TestNGRAPHFP16MeanOp(TestFP16MeanOp): + def setUp(self): + super(TestNGRAPHFP16MeanOp, self).setUp() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b42a1f73fa72b0dab936a3bb61a8893978b229ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import unittest +from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows + + +class TestNGRAPHScaleOp(TestScaleOp): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleFp16Op(TestScaleFp16Op): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows): + def init_dtype_type(self): + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index e2a9fc183ea9206efd892b23844081cb9d2fb3d3..2b0ab0cc3bc23bab140d2b7e8cb765e537ff3f5c 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -78,7 +78,6 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay if use_fast_executor: exec_strategy.use_experimental_executor = True - build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index 6cd71e39e41dae5d07e5761fc9caeca113f3b47e..ab34a51dd94fce97ae9220fb87b7d6e007ffa994 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest): self.activation = 'relu' self.add_bias = True self.add_residual_data = True + self.channels = None + self.outputs = None self.init_group() self.init_dilation() @@ -49,8 +51,9 @@ class TestConv2dFusionOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + self.output, _, _, _, _ = conv2d_forward_naive( + input, filter, self.groups, conv2d_param) + self.output = self.output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), @@ -58,19 +61,20 @@ class TestConv2dFusionOp(OpTest): } if self.add_residual_data: - residual_data = np.random.random(output.shape).astype(self.dtype) + residual_data = np.random.random(self.output.shape).astype( + self.dtype) self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype( residual_data) - output += residual_data + self.output += residual_data if self.add_bias: bias = np.random.random(self.filter_size[0]).astype(self.dtype) self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias) - output = output + bias.reshape((1, bias.size, 1, 1)) + self.output = self.output + bias.reshape((1, bias.size, 1, 1)) assert self.activation in ['relu', 'identity'] if self.activation == 'relu': - output = np.maximum(output, 0) + self.output = np.maximum(self.output, 0) self.attrs = { 'strides': self.stride, @@ -79,9 +83,12 @@ class TestConv2dFusionOp(OpTest): 'dilations': self.dilations, 'data_format': self.data_format, 'exhaustive_search': self.exhaustive_search, - 'activation': self.activation + 'activation': self.activation, + 'split_channels': self.channels } - self.outputs = {'Output': output} + self.outputs = {'Output': self.output} + + self.set_outputs() def testcuda(self): return core.is_compiled_with_cuda() @@ -117,6 +124,9 @@ class TestConv2dFusionOp(OpTest): def set_search_method(self): self.exhaustive_search = False + def set_outputs(self): + pass + class TestWithoutResidual(TestConv2dFusionOp): def init_bias_residual(self): @@ -160,5 +170,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp): self.exhaustive_search = True +class TestMultipleOutputs(TestConv2dFusionOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [1, 32, 17, 17] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [126, f_c, 3, 3] + self.channels = [84, 42] + + def set_outputs(self): + out1 = self.output[:, 0:84, :, :] + out2 = self.output[:, 84:126, :, :] + self.outputs['Outputs'] = [('out1', out1), ('out2', out2)] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..def188bfa632b5b1bb6b2621091d0526ffa345dc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py @@ -0,0 +1,270 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +from test_conv2d_op import conv2d_forward_naive, TestConv2dOp + + +def conv2d_forward_refer(input, filter, group, conv_param): + out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, + conv_param) + out_tmp = np.zeros((in_n, out_h, out_w, out_c)) + for n in range(in_n): + for i in range(out_h): + for j in range(out_w): + for m in range(out_c): + out_tmp[n, i, j, m] = out[n, m, i, j] + return out_tmp.reshape(in_n, out_c, out_h, out_w) + + +class TestConv2dInt8Op(TestConv2dOp): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False + self.data_format = "AnyLayout" + self.weighttype = np.float32 + self.use_mkldnn = True + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_fuse_relu() + self.init_data_type() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + filter = np.random.random(self.filter_size).astype(self.weighttype) + if self.srctype == np.uint8: + input = np.random.randint(0, 10, + self.input_size).astype(self.srctype) + else: + input = np.random.randint(-5, 5, + self.input_size).astype(self.srctype) + input_shift = (np.ones(self.input_size) * 128).astype(np.uint8) + + if self.srctype == np.int8: + filter_int = np.round(filter * self.scale_weights[0] * + 0.5).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0] * 0.5) + output1 = conv2d_forward_refer( + np.round((input.astype(np.int32) + input_shift) * + self.scale_in).astype(np.int32), filter_int, + self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + output2 = conv2d_forward_refer( + np.round((input_shift) * self.scale_in).astype(np.int32), + filter_int, self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + if self.fuse_relu: + output = np.maximum(np.round(output1 - output2), + 0).astype(self.dsttype) + else: + output = np.round(output1 - output2).astype(self.dsttype) + else: + filter_int = np.round(filter * + self.scale_weights[0]).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0]) + output1 = conv2d_forward_refer( + input.astype(np.int32), filter_int, self.groups, + conv2d_param).astype(np.float32) + if self.fuse_relu: + output = np.maximum( + np.round(output1 * (self.scale_out / ( + self.scale_in * self.scale_weights[0]))), + 0).astype(self.dsttype) + else: + output = np.round(output1 * (self.scale_out / ( + self.scale_in * + self.scale_weights[0]))).astype(self.dsttype) + + self.inputs = { + 'Input': + OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'exhaustive_search': self.exhaustive_search, + 'Scale_in': self.scale_in, + 'Scale_out': self.scale_out, + 'Scale_weights': self.scale_weights, + 'fuse_relu': self.fuse_relu + } + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), atol=0) + + def test_check_grad(self): + pass + + def test_check_grad_no_filter(self): + pass + + def test_check_grad_no_input(self): + pass + + def init_test_case(self): + TestConv2dOp.init_test_case(self) + f_c = self.input_size[1] // self.groups + self.filter_size = [1, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_data_type(self): + self.srctype = np.uint8 + self.dsttype = np.int8 + + def init_fuse_relu(self): + self.fuse_relu = True + + +#--------------------test conv2d u8 in and u8 out-------------------- + + +class TestConv2d(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + +class TestWithPad(TestConv2d): + def init_test_case(self): + TestConv2d.init_test_case(self) + self.pad = [1, 1] + + +class TestWithGroup(TestConv2d): + def init_group(self): + self.groups = 3 + + +class TestWithStride(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.8 + self.scale_weights = [10.0] + + +class TestWith1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [1, 3, 5, 5] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [12.0] + + +class TestWithInput1x1Filter1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_group(self): + self.groups = 3 + + +def init_data_type_with_fusion(self, input_dt, fuse_relu): + self.srctype = input_dt + self.dsttype = np.uint8 if fuse_relu else np.int8 + + def init_fuse_relu(self): + self.fuse_relu = fuse_relu + + +def create_test_int8_class(parent): + + #--------------------test conv2d s8 in and u8 out-------------------- + + class TestS8U8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, True) + + #--------------------test conv2d s8 in and s8 out-------------------- + + class TestS8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, False) + + #--------------------test conv2d u8 in and s8 out-------------------- + + class TestU8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.uint8, False) + + cls_name_s8u8 = "{0}_relu_{1}".format(parent.__name__, "1") + cls_name_s8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + cls_name_u8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + TestS8U8Case.__name__ = cls_name_s8u8 + TestS8S8Case.__name__ = cls_name_s8s8 + TestU8S8Case.__name__ = cls_name_u8s8 + globals()[cls_name_s8u8] = TestS8U8Case + globals()[cls_name_s8s8] = TestS8S8Case + globals()[cls_name_u8s8] = TestU8S8Case + + +create_test_int8_class(TestConv2dInt8Op) +create_test_int8_class(TestWithPad) +create_test_int8_class(TestWithStride) +create_test_int8_class(TestWithGroup) +create_test_int8_class(TestWith1x1) +create_test_int8_class(TestWithInput1x1Filter1x1) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index bcb79f232bd28bcb534ff2a2a0b799297ff96b71..25a9e8d46edb663600a1c1007cdda673e348a881 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): np.sum(input_pad_masked * f_sub[k, :, :, :], axis=(1, 2, 3)) - return out + return out, in_n, out_h, out_w, out_c class TestConv2dOp(OpTest): @@ -85,8 +85,9 @@ class TestConv2dOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups, + conv2d_param) + output = output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py deleted file mode 100644 index aa19a5edc7814315edaacf6e76072f62fcf7eb55..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -import paddle -import numpy as np - - -class TestDataBalance(unittest.TestCase): - def prepare_data(self): - def fake_data_generator(): - for n in range(self.total_ins_num): - yield np.ones((3, 4)) * n, n - - # Prepare data - with fluid.program_guard(fluid.Program(), fluid.Program()): - reader = paddle.batch( - fake_data_generator, batch_size=self.batch_size) - feeder = fluid.DataFeeder( - feed_list=[ - fluid.layers.data( - name='image', shape=[3, 4], dtype='float32'), - fluid.layers.data( - name='label', shape=[1], dtype='int64'), - ], - place=fluid.CPUPlace()) - self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( - self.data_file_name, reader, feeder) - - def prepare_lod_data(self): - def fake_data_generator(): - for n in range(1, self.total_ins_num + 1): - d1 = (np.ones((n, 3)) * n).astype('float32') - d2 = (np.array(n).reshape((1, 1))).astype('int32') - yield d1, d2 - - # Prepare lod data - with fluid.program_guard(fluid.Program(), fluid.Program()): - with fluid.recordio_writer.create_recordio_writer( - filename=self.lod_data_file_name) as writer: - eof = False - generator = fake_data_generator() - while (not eof): - data_batch = [ - np.array([]).reshape((0, 3)), np.array([]).reshape( - (0, 1)) - ] - lod = [0] - for _ in range(self.batch_size): - try: - ins = next(generator) - except StopIteration: - eof = True - break - for i, d in enumerate(ins): - data_batch[i] = np.concatenate( - (data_batch[i], d), axis=0) - lod.append(lod[-1] + ins[0].shape[0]) - if data_batch[0].shape[0] > 0: - for i, d in enumerate(data_batch): - t = fluid.LoDTensor() - t.set(data_batch[i], fluid.CPUPlace()) - if i == 0: - t.set_lod([lod]) - writer.append_tensor(t) - writer.complete_append_tensor() - - def setUp(self): - self.use_cuda = fluid.core.is_compiled_with_cuda() - self.data_file_name = './data_balance_test.recordio' - self.lod_data_file_name = './data_balance_with_lod_test.recordio' - self.total_ins_num = 50 - self.batch_size = 12 - self.prepare_data() - self.prepare_lod_data() - - def main(self): - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data_reader = fluid.layers.io.open_files( - filenames=[self.data_file_name], - shapes=[[-1, 3, 4], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - if self.use_cuda: - data_reader = fluid.layers.double_buffer(data_reader) - image, label = fluid.layers.read_file(data_reader) - - place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - - build_strategy = fluid.BuildStrategy() - build_strategy.enable_data_balance = True - parallel_exe = fluid.ParallelExecutor( - use_cuda=self.use_cuda, - main_program=main_prog, - build_strategy=build_strategy) - - if (parallel_exe.device_count > self.batch_size): - print("WARNING: Unittest TestDataBalance skipped. \ - For the result is not correct when device count \ - is larger than batch size.") - return - fetch_list = [image.name, label.name] - - data_appeared = [False] * self.total_ins_num - while (True): - try: - image_val, label_val = parallel_exe.run(fetch_list, - return_numpy=True) - except fluid.core.EOFException: - break - ins_num = image_val.shape[0] - broadcasted_label = np.ones( - (ins_num, 3, 4)) * label_val.reshape((ins_num, 1, 1)) - self.assertEqual(image_val.all(), broadcasted_label.all()) - for l in label_val: - self.assertFalse(data_appeared[l[0]]) - data_appeared[l[0]] = True - for i in data_appeared: - self.assertTrue(i) - - def main_lod(self): - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data_reader = fluid.layers.io.open_files( - filenames=[self.lod_data_file_name], - shapes=[[-1, 3], [-1, 1]], - lod_levels=[1, 0], - dtypes=['float32', 'int32']) - ins, label = fluid.layers.read_file(data_reader) - - place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - build_strategy = fluid.BuildStrategy() - build_strategy.enable_data_balance = True - parallel_exe = fluid.ParallelExecutor( - use_cuda=self.use_cuda, - main_program=main_prog, - build_strategy=build_strategy) - - if parallel_exe.device_count > self.batch_size: - print("WARNING: Unittest TestDataBalance skipped. \ - For the result is not correct when device count \ - is larger than batch size.") - exit(0) - fetch_list = [ins.name, label.name] - - data_appeared = [False] * self.total_ins_num - while (True): - try: - ins_tensor, label_tensor = parallel_exe.run( - fetch_list, return_numpy=False) - except fluid.core.EOFException: - break - - ins_val = np.array(ins_tensor) - label_val = np.array(label_tensor) - ins_lod = ins_tensor.lod()[0] - self.assertEqual(ins_val.shape[1], 3) - self.assertEqual(label_val.shape[1], 1) - self.assertEqual(len(ins_lod) - 1, label_val.shape[0]) - for i in range(0, len(ins_lod) - 1): - ins_elem = ins_val[ins_lod[i]:ins_lod[i + 1]][:] - label_elem = label_val[i][0] - self.assertEqual(ins_elem.all(), label_elem.all()) - self.assertFalse(data_appeared[int(label_elem - 1)]) - data_appeared[int(label_elem - 1)] = True - - for i in data_appeared: - self.assertTrue(i) - - def test_all(self): - self.main() - self.main_lod() - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 07cc44aaa266af39fbf3d726ee51a9afc5cb3756..0caab08f0dc19efceadd723b474c10e1a2deb449 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -442,10 +442,10 @@ class TestDistBase(unittest.TestCase): tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, w0_ep, self._lr / 2) + 0, w0_ep, self._lr) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, w1_ep, self._lr / 2) + 1, w1_ep, self._lr) if self._mem_opt: tr0_cmd += " --mem_opt" diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2a4e5ca0c050813785f602c5d2088466e616971..28602d3251a36130bfcfdda406aa85673e1ad4c7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -15,6 +15,18 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os + + +def skip_ci(func): + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + + def __func__(*args, **kwargs): + if on_ci: + return + return func(*args, **kwargs) + + return __func__ class TestDistSeResneXt2x2(TestDistBase): @@ -22,6 +34,7 @@ class TestDistSeResneXt2x2(TestDistBase): self._sync_mode = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -32,6 +45,7 @@ class TestDistseResnXt2x2WithMemopt(TestDistBase): self._mem_opt = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -41,6 +55,7 @@ class TestDistSeResneXt2x2Async(TestDistBase): self._sync_mode = False self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index d9ad4e2e2c7b8d0a99d917495fbc8efc6cbd188d..3d1ce6b27c935ddca0f2f5fb377e69b571e3714c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -14,14 +14,15 @@ from __future__ import print_function +import traceback import math +import collections +import six import unittest +import numpy as np + import paddle.fluid as fluid -from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import traceback -import collections -import six class TranspilerTest(unittest.TestCase): @@ -520,7 +521,7 @@ class TestLocalLookupTable(TestDistLookupTableBase): 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv', - 'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat' + 'recv', 'fetch_barrier' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) @@ -560,7 +561,7 @@ class TestDistLookupTable(TestDistLookupTableBase): 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier', - 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' + 'recv', 'recv', 'fetch_barrier' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) startup_ops = [ @@ -607,8 +608,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', - 'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv', - 'recv', 'concat', 'concat' + 'sum', 'split_selected_rows', 'send', 'recv', 'recv' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) @@ -648,8 +648,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', - 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv', - 'recv', 'concat' + 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) startup_ops = [ @@ -824,5 +823,142 @@ class TestRemoteLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +# test for remote prefetch +class TestRemoteNce(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 20 + sampler = "uniform" + nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') + + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='nce_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + name='nce_b', + initializer=fluid.initializer.ConstantInitializer()) + + cost = fluid.layers.nce(input=input, + label=label, + num_total_classes=num_total_classes, + sampler=sampler, + custom_dist=nid_freq_arr.tolist(), + sample_weight=None, + param_attr='nce_w', + bias_attr='nce_b', + seed=1, + num_neg_samples=5, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + + out_vars = ["nce_w"] + in_vars = ["nce_b"] + + recv_var_names = [] + + for op in trainer.blocks[0].ops: + if op.type == "recv": + for var in op.output("Out"): + recv_var_names.append(var) + + for out_var in out_vars: + self.assertFalse(out_var in recv_var_names) + for in_var in in_vars: + self.assertTrue(in_var in recv_var_names) + + +# test for remote prefetch +class TestRemoteHsigmoid(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 3 + + input = fluid.layers.data(name="input", shape=[1], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + path_table = fluid.layers.data( + name='path_table', shape=[3], dtype='int64') + path_code = fluid.layers.data( + name='path_code', shape=[3], dtype='int64') + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='hs_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[3, 1], + dtype='float32', + name='hs_b', + initializer=fluid.initializer.ConstantInitializer()) + + emb = fluid.layers.embedding( + input=input, + is_sparse=is_sparse, + size=[3, 3], + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(num_total_classes)))) + + cost = fluid.layers.hsigmoid( + input=emb, + label=label, + num_classes=num_total_classes, + path_table=path_table, + path_code=path_code, + is_custom=True, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + params_to_check = list() + for op in trainer.blocks[0].ops: + if op.type == "hierarchical_sigmoid": + params_to_check = [op.input("W")[0], op.input("Bias")[0]] + for name in ["epmap", "table_names", "epmap"]: + assert op.has_attr(name) + if name == "epmap": + assert op.attr(name)[0] == u'127.0.0.1:6174' + elif name == "table_names": + assert op.attr(name)[0] == u'hierarchical_sigmoid_0.w_0' + else: + assert op.attr(name) == 3 + elif op.type == "lookup_table": + params_to_check.append(op.input("W")[0]) + else: + pass + op_count = 0 + for op in trainer.blocks[0].ops: + if op.type == "recv": + assert len(op.output("Out")) == 1 + assert op.output("Out")[0] == u'hierarchical_sigmoid_0.b_0' + op_count += 1 + assert op_count == 1 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py index 89476ee641f1dd295a3caca89ac41038cad317f2..81b0b667814e851e8bd47ae1a3b0bf00a9a73ecd 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -29,6 +29,12 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): print('Skip use_cuda=True because Paddle is not compiled with cuda') return + if use_parallel_executor and os.name == 'nt': + print( + 'Skip use_parallel_executor=True because Paddle comes without parallel support on windows' + ) + return + word_dict = paddle.dataset.imdb.word_dict() train_reader = paddle.batch( paddle.dataset.imdb.train(word_dict), batch_size=batch_size) diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py new file mode 100644 index 0000000000000000000000000000000000000000..584e309befcee18ad913d935c803fdd387a92745 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +import paddle.compat as cpt + + +class TestFusedEmbeddingSeqPoolOp(OpTest): + def setUp(self): + self.op_type = "fused_embedding_seq_pool" + self.emb_size = 2 + table = np.random.random((17, self.emb_size)).astype("float32") + ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]], + [[16], [1]]]).astype("int64") + merged_ids = np.array([4, 2, 16]).astype("int64") + ids_expand = np.expand_dims(ids, axis=1) + self.lod = [[3, 1]] + self.attrs = {'is_sparse': True} + self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)} + self.outputs = { + 'Out': np.reshape( + np.array([ + table[[4, 3]] + table[[4, 3]] + table[[2, 1]], + table[[16, 1]] + ]), [len(self.lod[0]), 2 * self.emb_size]) + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 2a6c93f75fad53440a2db64e4f34c9a5c22c654e..8ed5074dc2626ff58fc65d8af1340e260c029572 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, } diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py new file mode 100644 index 0000000000000000000000000000000000000000..da343dd503a62e83f431dd0ffb02a7e70be7d0d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py @@ -0,0 +1,269 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_hsigmoid_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_hsigmoid_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i < 2: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), i + 9).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def test_hsigmoid_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + + for place in places: + self._run_hsigmoid_op_one_pserver(place, port0) + self._run_hsigmoid_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 0fe69d1bd4b1b10c09879871c8cf1fc197d1106b..1dc13ec74e8da1f13d447950b3c7822bbbecb2a7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -18,17 +18,8 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.layers.nn import FC - - -@contextlib.contextmanager -def new_program_scope(): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield +from paddle.fluid.imperative.nn import FC +from test_imperative_base import new_program_scope class MyLayer(fluid.imperative.PyLayer): @@ -36,9 +27,11 @@ class MyLayer(fluid.imperative.PyLayer): super(MyLayer, self).__init__() def forward(self, inputs): - x = fluid.layers.relu(inputs[0]) + x = fluid.layers.relu(inputs) self._x_for_debug = x - return [fluid.layers.elementwise_mul(x, x)] + x = fluid.layers.elementwise_mul(x, x) + x = fluid.layers.reduce_sum(x) + return [x] class MLP(fluid.imperative.PyLayer): @@ -52,7 +45,7 @@ class MLP(fluid.imperative.PyLayer): initializer=fluid.initializer.Constant(value=0.1))) def forward(self, inputs): - x = self._fc1(inputs[0]) + x = self._fc1(inputs) x = self._fc2(x) x = fluid.layers.reduce_sum(x) return x @@ -64,13 +57,14 @@ class TestImperative(unittest.TestCase): cl = core.Layer() cl.forward([]) l = fluid.imperative.PyLayer() - l.forward([]) + self.assertRaises(NotImplementedError, l.forward, []) def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) l = MyLayer() - x = l(np_inp)[0] + x = l(var_inp)[0] self.assertIsNotNone(x) dy_out = x._numpy() x._backward() @@ -95,8 +89,9 @@ class TestImperative(unittest.TestCase): def test_mlp(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) mlp = MLP() - out = mlp(np_inp) + out = mlp(var_inp) dy_out = out._numpy() out._backward() dy_grad = mlp._fc1._w._gradient() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_base.py b/python/paddle/fluid/tests/unittests/test_imperative_base.py new file mode 100644 index 0000000000000000000000000000000000000000..478cc13fb5bb775b3a40e674e70555fa50117836 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_base.py @@ -0,0 +1,30 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import core + + +@contextlib.contextmanager +def new_program_scope(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..5d97edf8768d8d2cf1ba7f826fa4d588c30f2aee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -0,0 +1,206 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope + + +class SimpleImgConvPool(fluid.imperative.PyLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__() + + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) + + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x + + +class MNIST(fluid.imperative.PyLayer): + def __init__(self, param_attr=None, bias_attr=None): + super(MNIST, self).__init__() + + self._simple_img_conv_pool_1 = SimpleImgConvPool( + 1, 20, 5, 2, 2, act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + 20, 50, 5, 2, 2, act="relu") + + pool_2_shape = 50 * 8 * 8 + SIZE = 10 + scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 + self._fc = FC(10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + + def forward(self, inputs): + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) + x = self._fc(x) + return x + + +class TestImperativeMnist(unittest.TestCase): + def test_mnist_cpu_float32(self): + seed = 90 + + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + # mnist = Conv2D(1, 20, 5) + mnist = MNIST() + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + + dy_param_init_value = {} + for batch_id, data in enumerate(train_reader()): + if batch_id >= 2: + break + + x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + 128, 1) + + img = to_variable(x_data) + label = to_variable(y_data) + label._stop_gradient = True + + cost = mnist(img) + loss = fluid.layers.reduce_mean(cost) + dy_out = loss._numpy() + + if batch_id == 0: + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_init_value[param.name] = param._numpy() + + loss._backward() + sgd.minimize(loss) + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace()) + + # mnist = Conv2D(1, 20, 5) + mnist = MNIST() + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + + img = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = mnist(img) + loss = fluid.layers.reduce_mean(cost) + sgd.minimize(loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + for param in fluid.default_startup_program().global_block( + ).all_parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= 2: + break + + x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + + fetch_list = [loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue( + np.allclose(value.all(), dy_param_init_value[key].all())) + self.assertTrue(np.allclose(static_out.all(), dy_out.all())) + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value.all(), dy_param_value[key].all())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 92065abb9bf0a8c7c60b24814869f52fec49264b..2e2f9a5583bf2ca6456344bfe759c55fae1bb872 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -243,6 +243,10 @@ class TestBook(unittest.TestCase): pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) self.assertIsNotNone(pool) self.assertIsNotNone(mask) + self.assertIsNotNone(layers.adaptive_pool2d(x, 3, pool_type='avg')) + pool, mask = layers.adaptive_pool2d(x, 3, require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_adaptive_pool3d(self): program = Program() @@ -255,6 +259,10 @@ class TestBook(unittest.TestCase): x, [3, 3, 3], require_index=True) self.assertIsNotNone(pool) self.assertIsNotNone(mask) + self.assertIsNotNone(layers.adaptive_pool3d(x, 3, pool_type='avg')) + pool, mask = layers.adaptive_pool3d(x, 3, require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_lstm_unit(self): program = Program() diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py new file mode 100644 index 0000000000000000000000000000000000000000..cc6f40de86e302605a416c48790c74cbb431b2e3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -0,0 +1,236 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def nce(input, weight, bias, sample_weight, labels, num_classes, + num_sample_class): + samples = [] + sample_labels = [] + batch_size = input.shape[0] + num_true_class = labels.shape[1] + for i in range(batch_size): + w = 1 if sample_weight is None else sample_weight[i] + for label in labels[i]: + samples.append((i, label, True, w)) + sample_labels.append(label) + for num in range(num_sample_class): + samples.append((i, num, False, w)) + sample_labels.append(num) + # forward bias + sample_out = np.zeros(len(samples)).astype(np.float32) + if bias is not None: + for i in range(len(samples)): + sample_out[i] = bias[samples[i][1]] + # forward weight + for i in range(len(samples)): + sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + + # forward activation + sample_out = 1.0 / (1.0 + np.exp(-sample_out)) + # forward cost + out = np.zeros(batch_size).astype(np.float32) + b = 1.0 / num_classes * num_sample_class + + for i in range(len(samples)): + o = sample_out[i] + cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) + out[samples[i][0]] += cost * samples[i][3] + return (out[:, np.newaxis], np.array(sample_out).reshape( + batch_size, num_sample_class + num_true_class), + np.array(sample_labels).reshape(batch_size, + num_sample_class + num_true_class)) + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_nce_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('Input').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('Weight').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") + param.set(param_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + sample_w = scope.var('SampleWeight').get_tensor() + sample_weight = np.random.random((4, 1)).astype("float32") + sample_w.set(sample_weight, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([[0], [1], [4], [3]]) + label.set(label_array, place) + + cost = scope.var('Cost').get_tensor() + cost_w = np.zeros((4, 1)).astype("float32") + cost.set(cost_w, place) + + sample_l = scope.var('SampleLogits').get_tensor() + sample_l_w = np.zeros((4, 3)).astype("float32") + sample_l.set(sample_l_w, place) + + sample_la = scope.var('SampleLabels').get_tensor() + sample_la_w = np.zeros((4, 3)).astype("int") + sample_la.set(sample_la_w, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run nce operator + nce_op = Operator( + "nce", + Input='Input', + Weight='Weight', + Label='Label', + Bias='Bias', + Cost='Cost', + SampleLogits='SampleLogits', + SampleLabels='SampleLabels', + SampleWeight='SampleWeight', + num_total_classes=5, + num_neg_samples=2, + custom_neg_classes=list(range(2)), + sampler=0, + seed=0, + is_sparse=True, + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + nce_op.run(scope, place) + + # get and compare result + o_cost = np.array(scope.var('Cost').get_tensor()) + o_logits = np.array(scope.var('SampleLogits').get_tensor()) + o_labels = np.array(scope.var('SampleLabels').get_tensor()) + + param_array = np.ones((5, 8)).astype("float32") + for i in range(2): + param_array[i] *= param_array[i] * i + 0 * 10 + 1 + for i in range(2, 5): + param_array[i] *= param_array[i] * i + 1 * 10 + 1 + out = nce(x_array, param_array, bias_array, sample_weight, + label_array, 5, 2) + + self.assertAlmostEqual(o_cost.all(), out[0].all(), delta=1e-6) + self.assertAlmostEqual(o_logits.all(), out[1].all(), delta=1e-6) + self.assertAlmostEqual(o_labels.all(), out[2].all(), delta=1e-6) + + def test_nce_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + + for place in places: + self._run_nce_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 37b9a9188ab44df81029ae6d9925ae21c1929cff..4153394c1da776d0a41e1415a09fa7d6f4b14d6d 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope", "op_callstack" + "op_namescope" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 84b0aad8acb096a32f625e32fb640599f2882d97..1c6cfce0c2b772fa78fa08fa1bfb383c1e4f7939 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -175,41 +175,61 @@ class TestCRFModel(unittest.TestCase): print(pe.run(feed=feeder.feed(cur_batch), fetch_list=[avg_cost.name])[0]) - def test_update_sparse_parameter_all_reduce(self): + def _new_build_strategy(self, use_reduce=False): build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + if use_reduce: + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + return build_strategy + + def test_update_sparse_parameter_all_reduce(self): if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) + is_sparse=True, + build_strategy=self._new_build_strategy(), + use_cuda=True) + self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=False) + is_sparse=True, + build_strategy=self._new_build_strategy(), + use_cuda=False) def test_update_dense_parameter_all_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=True) + is_sparse=False, + build_strategy=self._new_build_strategy(), + use_cuda=True) + self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=False) + is_sparse=False, + build_strategy=self._new_build_strategy(), + use_cuda=False) def test_update_sparse_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) + is_sparse=True, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=True) self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=False) + is_sparse=True, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=False) def test_update_dense_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=True) + is_sparse=False, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=True) self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=False) + is_sparse=False, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3eecc4670152e72443f731c71d7db67ca8e02e72..9768f7db26c76b1f6fcffa24fd2ea3c0abd17aeb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -86,6 +86,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_reduce=False) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f4495d0bc8198189962d033ec18b8b67f1f47c84 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +from test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive + + +class TestPool2dMKLDNNInt8_Op(TestPool2D_Op): + def init_kernel_type(self): + self.use_mkldnn = True + + def init_data_type(self): + self.dtype = np.int8 + + def setUp(self): + TestPool2D_Op.setUp(self) + assert self.dtype in [np.int8, np.uint8 + ], 'Dtype should be int8 or uint8' + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), atol=1e-5) + + def test_check_grad(self): + pass + + +class TestCase1Avg(TestPool2dMKLDNNInt8_Op): + def init_test_case(self): + self.shape = [2, 3, 7, 7] + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [0, 0] + + def init_global_pool(self): + self.global_pool = False + + +class TestCase2Avg(TestPool2dMKLDNNInt8_Op): + def init_test_case(self): + self.shape = [2, 3, 7, 7] + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 1] + + def init_global_pool(self): + self.global_pool = False + + +class TestCase0Max(TestPool2dMKLDNNInt8_Op): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +class TestCase1Max(TestCase1Avg): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +class TestCase2Max(TestCase2Avg): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +def create_test_s8_u8_class(parent): + class TestS8Case(parent): + def init_data_type(self): + self.dtype = np.int8 + + class TestU8Case(parent): + def init_data_type(self): + self.dtype = np.uint8 + + cls_name_s8 = "{0}_{1}".format(parent.__name__, "mkldnn_s8") + cls_name_u8 = "{0}_{1}".format(parent.__name__, "mkldnn_u8") + TestS8Case.__name__ = cls_name_s8 + TestU8Case.__name__ = cls_name_u8 + globals()[cls_name_s8] = TestS8Case + globals()[cls_name_u8] = TestU8Case + + +create_test_s8_u8_class(TestPool2dMKLDNNInt8_Op) +create_test_s8_u8_class(TestCase1Avg) +create_test_s8_u8_class(TestCase2Avg) +create_test_s8_u8_class(TestCase0Max) +create_test_s8_u8_class(TestCase1Max) +create_test_s8_u8_class(TestCase2Max) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py index 19f29c78269eef6414342a200b719e498c2037d2..7de5fefc148021d4109da2ac9f4b36c93a05a23f 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py @@ -18,35 +18,22 @@ import unittest from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 -class TestMKLDNNCase1(TestPool2D_Op): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase2(TestCase1): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase3(TestCase2): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase4(TestCase3): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase5(TestCase4): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase6(TestCase5): - def init_kernel_type(self): - self.use_mkldnn = True - +def create_test_mkldnn_class(parent): + class TestMKLDNNCase(parent): + def init_kernel_type(self): + self.use_mkldnn = True + + cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNOp") + TestMKLDNNCase.__name__ = cls_name + globals()[cls_name] = TestMKLDNNCase + + +create_test_mkldnn_class(TestPool2D_Op) +create_test_mkldnn_class(TestCase1) +create_test_mkldnn_class(TestCase2) +create_test_mkldnn_class(TestCase3) +create_test_mkldnn_class(TestCase4) +create_test_mkldnn_class(TestCase5) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 5ccdf082e8a4f8aabcd55b6b470a77690ee6f61f..92515add599522625ed8506ec4fa4f002d2777b5 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -115,7 +115,7 @@ class TestPool2D_Op(OpTest): self.op_type = "pool2d" self.use_cudnn = False self.use_mkldnn = False - self.dtype = np.float32 + self.init_data_type() self.init_test_case() self.init_global_pool() self.init_kernel_type() @@ -177,6 +177,9 @@ class TestPool2D_Op(OpTest): def init_kernel_type(self): pass + def init_data_type(self): + self.dtype = np.float32 + def init_pool_type(self): self.pool_type = "avg" self.pool2D_forward_naive = avg_pool2D_forward_naive diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 943ad3ed22480193dc51375cdcca5ed36ce35158..655378f7f8c18f5936643f6c178d0e6b3dd8cac8 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -26,7 +26,7 @@ os.environ['CPU_NUM'] = str(dev_cnt) def dummy_func_with_no_input(): - return float(1.0) + return np.array([0], dtype='float32') def dummy_func_with_no_output(x): @@ -105,7 +105,7 @@ def simple_fc_net(img, label, use_py_func_op): name='test_tmp_var', dtype='float32', shape=[1]) fluid.layers.py_func( func=dummy_func_with_no_input, x=None, out=dummy_var) - + loss += dummy_var fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None) loss = fluid.layers.mean(loss) @@ -174,7 +174,7 @@ class TestPyFuncOpUseExecutor(unittest.TestCase): self.assertAlmostEqual(max_diff, 0, delta=1e-3) -class TestPyFuncOpUseParallelExecutor(unittest.TestCase): +class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): def setUp(self): self.use_parallel_executor = True diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index d94494e219c5f348a08b4c3c2d111674ea6badf3..559386545e7304aab50043a64f7a89f48d7bbea9 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -209,6 +209,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase): else: thread = threading.Thread( target=feed_data, args=(feed_queue, reader)) + thread.daemon = True thread.start() self.outputs = [] @@ -219,6 +220,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase): feed_queue.close() self.validate() + if not use_decorate_paddle_reader: + thread.join() def validate(self): self.assertEqual(len(self.inputs), len(self.outputs)) diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index e97a05b6f929821f82d96b462598a5ff03cf0a48..7eeffa1039a1e14a8883c4a78305d253a4518b26 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase): exe.run(startup_prog) build_strategy = fluid.BuildStrategy() - if with_double_buffer: - build_strategy.enable_data_balance = True exec_strategy = fluid.ExecutionStrategy() parallel_exe = fluid.ParallelExecutor( use_cuda=self.use_cuda, diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py new file mode 100644 index 0000000000000000000000000000000000000000..f37d2bfb2e86b452cf7fd05c3e5871de2e33d629 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -0,0 +1,188 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import contextlib + +import unittest +from functools import partial +import numpy as np +import paddle +import paddle.fluid.core as core + +import paddle.fluid as fluid + + +def get_places(): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + +@contextlib.contextmanager +def prog_scope_guard(main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestWeightDecay(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=4)() + self.train_data = [next(reader) for _ in range(5)] + self.learning_rate = .5 + + def run_executor(self, place, feed_list, loss): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + main_prog = fluid.default_main_program() + loss_set = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=[loss.name]) + + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def run_parallel_exe(self, + place, + feed_list, + loss, + use_cuda=True, + use_reduce=False, + use_fast_executor=False, + use_ir_memory_optimize=False): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + exec_strategy = fluid.ExecutionStrategy() + if use_fast_executor: + exec_strategy.use_experimental_executor = True + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.memory_optimize = use_ir_memory_optimize + + parallel_exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + + loss_set = [] + for data in self.train_data: + out = parallel_exe.run(feed=feeder.feed(data), + fetch_list=[loss.name]) + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def check_weight_decay(self, + place, + model, + use_parallel_exe=False, + use_reduce=False): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + param_list = [(var, var * self.learning_rate) + for var in main_prog.block(0).all_parameters()] + + optimizer = fluid.optimizer.Adagrad( + learning_rate=self.learning_rate) + + optimizer.minimize(avg_cost) + + for params in param_list: + updated_p = fluid.layers.elementwise_sub( + x=params[0], y=params[1]) + fluid.layers.assign(input=updated_p, output=params[0]) + + if use_parallel_exe: + loss = self.run_parallel_exe( + place, [data, label], + loss=avg_cost, + use_cuda=True, + use_reduce=use_reduce) + else: + loss = self.run_executor(place, [data, label], loss=avg_cost) + + return loss + + def test_weight_decay(self): + model = partial(bow_net, is_sparse=False) + for place in get_places(): + loss = self.check_weight_decay(place, model, use_parallel_exe=False) + + loss2 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=False) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5) + + loss3 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=True) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index dc3b2cb8bc15836a4bf067caa05c3a37a917ecad..c4eb26893cd1faac72ac06c70a68c52f26b39182 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): var_dict = {} for var_proto in proto_list: var_name = str(var_proto.name) + if (var_name not in np_list) and var_proto.dispensable: + continue if is_input: - if (var_name not in np_list) and var_proto.dispensable: - continue assert (var_name in np_list) or (var_proto.dispensable), \ "Missing {} as input".format(var_name) if var_proto.duplicable: diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d21ec42dccde80fd354a730274edb04f654113c3..07343b4051e0f44996d1d4617e2cbd1a0d22ce3e 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size): class DistributeTranspilerConfig(object): """ - Args: - slice_var_up (bool): Do Tensor slice for pservers, default is True. - split_method (PSDispatcher): RoundRobin or HashName can be used - try to choose the best method to balance loads for pservers. - min_block_size (int): Minimum splitted element number in block. - According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 + .. py:attribute:: slice_var_up (bool) + + Do Tensor slice for pservers, default is True. + + .. py:attribute:: split_method (PSDispatcher) + + RoundRobin or HashName can be used. + Try to choose the best method to balance loads for pservers. + + .. py:attribute:: min_block_size (int) + + Minimum number of splitted elements in block. + + According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 We can use bandwidth effiently when data size is larger than 2MB.If you - want to change it, please be sure you see the slice_variable function. + want to change it, please be sure you have read the slice_variable function. + """ slice_var_up = True @@ -242,11 +251,10 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table"] + sparse_update_op_types = ["lookup_table", "nce", "hierarchical_sigmoid"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( - 'remote_prefetch') is True and not op.attr( - 'is_distributed'): + 'remote_prefetch') is True: sparse_update_ops.append(op) return sparse_update_ops diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index ccf7af334d091dff5cf6a94e5875cdf582fa0c5c..cc7f5ec90c26c87b7c5514c900e853be9e16d6eb 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -57,7 +57,7 @@ class InferenceTranspiler(object): raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) diff --git a/tools/check_doc_approval.py b/tools/check_doc_approval.py new file mode 100644 index 0000000000000000000000000000000000000000..44fdf58b49a1715696e8c28746282c38fb3c7763 --- /dev/null +++ b/tools/check_doc_approval.py @@ -0,0 +1,85 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import ast +import hashlib +import importlib +import paddle.fluid + +files = [ + "paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward", + "paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor", + "paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers", + "paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer", + "paddle.fluid.profiler", "paddle.fluid.recordio_writer", + "paddle.fluid.regularizer", "paddle.fluid.transpiler" +] + + +def md5(doc): + hash = hashlib.md5() + hash.update(str(doc)) + return hash.hexdigest() + + +def get_module(): + for fi in files: + fi_lib = importlib.import_module(fi) + doc_function = getattr(fi_lib, "__all__") + for api in doc_function: + api_name = fi + "." + api + try: + doc_module = getattr(eval(api_name), "__doc__") + except: + pass + doc_md5_code = md5(doc_module) + doc_dict[api_name] = doc_md5_code + + +def doc_md5_dict(doc_md5_path): + with open(doc_md5_path, "rb") as f: + doc_md5 = f.read() + doc_md5_dict = ast.literal_eval(doc_md5) + return doc_md5_dict + + +def check_doc_md5(): + for k, v in doc_dict.items(): + try: + if doc_ci_dict[k] != v: + return doc_dict + except: + return doc_dict + return True + + +if __name__ == "__main__": + doc_dict = {} + doc_ci_dict = {} + doc_md5_file = "/root/.cache/doc_md5.txt" + if not os.path.exists(doc_md5_file): + os.mknod(doc_md5_file) + else: + doc_ci_dict = doc_md5_dict(doc_md5_file) + get_module() + if not os.path.getsize(doc_md5_file): + with open(doc_md5_file, 'w') as f: + f.write(str(doc_dict)) + check_dic = True + print(check_dic) + else: + check_dic = check_doc_md5() + print(check_dic)