diff --git a/CMakeLists.txt b/CMakeLists.txt index 1594e798a2ba3f735a28a43ef933d80b3b3f8964..cb646d3ce5d660734a27c0ac9f18ad54cd3e1c1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_PSLIB "Compile with pslib support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) @@ -131,8 +132,6 @@ if (APPLE OR WIN32) endif() if (WIN32) - set(WITH_AVX OFF CACHE STRING - "Disable AVX when compiling for Windows" FORCE) set(WITH_DSO OFF CACHE STRING "Disable DSO when compiling for Windows" FORCE) set(WITH_MKL OFF CACHE STRING @@ -217,6 +216,12 @@ include(cupti) include(external/gzstream) endif (NOT WIN32) +if(WITH_PSLIB) + include(external/libmct) + include(external/pslib_brpc) + include(external/pslib) +endif(WITH_PSLIB) + if(WITH_DISTRIBUTE) if(WITH_GRPC) include(external/grpc) @@ -284,6 +289,12 @@ set(EXTERNAL_LIBS ${PYTHON_LIBRARIES} ) +if(WITH_PSLIB) + list(APPEND EXTERNAL_LIBS pslib) + list(APPEND EXTERNAL_LIBS pslib_brpc) + list(APPEND EXTERNAL_LIBS libmct) +endif(WITH_PSLIB) + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 51f7a61631d7102b60646abe1c6dd7775692f157..4ee2fdcf2db6bfa373f814ee4c0ab4d708486ea8 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -84,6 +84,10 @@ if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) endif(NOT WITH_GOLANG) +if(WITH_PSLIB) + add_definitions(-DPADDLE_WITH_PSLIB) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake new file mode 100644 index 0000000000000000000000000000000000000000..27cff8cfb6315c9b4fa5677ad9062bee73a0e5d8 --- /dev/null +++ b/cmake/external/libmct.cmake @@ -0,0 +1,78 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_LIBMCT}) + return() +ENDIF(NOT ${WITH_LIBMCT}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with LIBMCT in Paddle yet." + "Force WITH_LIBMCT=OFF") + SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(LIBMCT_PROJECT "extern_libmct") +IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE) + SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE) + SET(LIBMCT_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${LIBMCT_VER}/${LIBMCT_NAME}.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}") +SET(LIBMCT_SOURCE_DIR "${THIRD_PARTY_PATH}/libmct") +SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}") +SET(LIBMCT_DST_DIR "libmct") +SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) +SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) +SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") + +INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) + +FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(LIBMCT)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n" + " DESTINATION ${LIBMCT_DST_DIR})\n") + +ExternalProject_Add( + ${LIBMCT_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LIBMCT_SOURCE_DIR} + DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz + && tar zxvf ${LIBMCT_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(libmct STATIC ${dummyfile}) +else() + add_library(libmct INTERFACE) +endif() + +#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) +ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) +LIST(APPEND external_project_dependencies libmct) + diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..3b495d78e2c61f90418adbc5746792bc6e49d90b --- /dev/null +++ b/cmake/external/pslib.cmake @@ -0,0 +1,77 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB}) + return() +ENDIF(NOT ${WITH_PSLIB}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB in Paddle yet." + "Force WITH_PSLIB=OFF") + SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_PROJECT "extern_pslib") +IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) + SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") +SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") +SET(PSLIB_DOWNLOAD_DIR "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}") +SET(PSLIB_DST_DIR "pslib") +SET(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR}) +SET(PSLIB_ROOT ${PSLIB_INSTALL_DIR}) +SET(PSLIB_INC_DIR ${PSLIB_ROOT}/include) +SET(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib) +SET(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so) +SET(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_INC_DIR}) + +FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n" + " DESTINATION ${PSLIB_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz + && tar zxvf ${PSLIB_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) +ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) +LIST(APPEND external_project_dependencies pslib) + +IF(WITH_C_API) + INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7ff5a8aca187240108164900638f5a376e9fbc93 --- /dev/null +++ b/cmake/external/pslib_brpc.cmake @@ -0,0 +1,77 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB_BRPC}) + return() +ENDIF(NOT ${WITH_PSLIB_BRPC}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet." + "Force WITH_PSLIB_BRPC=OFF") + SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc") +IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_BRPC_VER}/${PSLIB_BRPC_NAME}.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") +SET(PSLIB_BRPC_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib_brpc") +SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}") +SET(PSLIB_BRPC_DST_DIR "pslib_brpc") +SET(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) +SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) +SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) +SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) +SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a) +SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR}) + +FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB_BRPC)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n" + " DESTINATION ${PSLIB_BRPC_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_BRPC_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_BRPC_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz + && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) +ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) +LIST(APPEND external_project_dependencies pslib_brpc) + +IF(WITH_C_API) + INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index a3599dd798c07f57ed82e3f25b6bb9fc4f8bdc3a..623c53f4f75bbd217c157bcdda0cb12c510269ee 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,8 +18,8 @@ ENDIF() INCLUDE(python_module) -FIND_PACKAGE(PythonInterp ${PY_VERSION}) -FIND_PACKAGE(PythonLibs ${PY_VERSION}) +FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED) +FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED) if(WIN32) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" @@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND) "please use pip to upgrade protobuf. pip install -U protobuf") ENDIF() ENDIF(PYTHONINTERP_FOUND) - INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index b30403d2d81ce471f39b4d92e24a500fe41eeebb..f9d4cd97400a68e613e3dd5467191a0d42a9942e 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -24,12 +24,6 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) -if (WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") -else(WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") -endif (WIN32) - ExternalProject_Add( extern_snappy GIT_REPOSITORY "https://github.com/google/snappy" @@ -56,6 +50,16 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") + add_custom_command(TARGET extern_snappy POST_BUILD + COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib + ) + ENDIF() + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") +else(WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +endif (WIN32) add_library(snappy STATIC IMPORTED GLOBAL) set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 4c2d64f627401071098e72bfb930fb5d62fa042d..c3e1212d8f8358e0148b5e00223414c9696686ee 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -56,7 +56,12 @@ else() endif() if (WIN32) - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") + IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") + add_custom_command(TARGET extern_xxhash POST_BUILD + COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib + ) + ENDIF() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") else() set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") endif () diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c3d73235453c8c9fd2859c3ab142888e8bda2dbe..d35073753725cd5772de3fc7a23af5ba69a65558 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -19,12 +19,6 @@ SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) -IF(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) -ELSE(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) -ENDIF(WIN32) - INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h. @@ -49,6 +43,16 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${ZLIB_INSTALL_DIR}/lib/libz.lib") + add_custom_command(TARGET extern_zlib POST_BUILD + COMMAND cmake -E copy ${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib ${ZLIB_INSTALL_DIR}/lib/libz.lib + ) + ENDIF() + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.lib" CACHE FILEPATH "zlib library." FORCE) +ELSE(WIN32) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) +ENDIF(WIN32) ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c679d8507d8a9d3bce48b7f38491dadd9f2fb7f6..9f0adef7aa603ec5a3c8a5aa347613f462c43e60 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -32,24 +32,35 @@ function(copy TARGET) list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_DSTS ${index} dst) if (WIN32) - # windows cmd shell will not expand wildcard automatically. - # below expand the files,libs and copy them by rules. - file(GLOB header_files ${src} "*.h") - file(GLOB static_lib_files ${src} "*.lib") - file(GLOB dll_lib_files ${src} "*.dll") - set(src_files ${header_files} ${static_lib_files} ${dll_lib_files}) - - if (NOT "${src_files}" STREQUAL "") - list(REMOVE_DUPLICATES src_files) - endif () - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" - ) - foreach (src_file ${src_files}) + if(IS_DIRECTORY ${src}) + get_filename_component(last_path ${src} NAME) + string(APPEND dst "/" ${last_path}) add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" - COMMENT "copying ${src_file} -> ${dst}") - endforeach () + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + if(EXISTS ${src}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND cmake -E copy_directory "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + else() + message(WARNING "${src} not exist!") + endif() + else() + # windows cmd shell will not expand wildcard automatically. + # below expand the files, and copy them by rules. + file(GLOB src_files ${src}) + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif () + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach (src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach () + endif() else (WIN32) # not windows add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}" @@ -95,7 +106,7 @@ copy(xxhash_lib DEPS xxhash ) -if (NOT PROTOBUF_FOUND) +if (NOT PROTOBUF_FOUND OR WIN32) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") copy(protobuf_lib SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} @@ -138,27 +149,25 @@ if (WITH_NGRAPH) ) endif () -if (NOT WIN32) - if (NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) +if (NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) - endif () -endif (NOT WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) +endif () # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") @@ -191,9 +200,21 @@ if (WITH_ANAKIN AND WITH_MKL) list(APPEND inference_deps anakin_inference_lib) endif () +if (TENSORRT_FOUND) + copy(tensorrt_lib DEPS ${inference_deps} + SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer* + DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib) +endif () + + set(module "inference") +if(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*) +else(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) +endif(WIN32) copy(inference_lib DEPS ${inference_deps} - SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* + SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib} ${src_dir}/${module}/api/paddle_*.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) @@ -233,7 +254,7 @@ copy(third_party DEPS fluid_lib_dist # only need libpaddle_fluid.so/a and paddle_*.h for inference-only library copy(inference_api_lib DEPS fluid_lib_dist - SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.* + SRCS ${paddle_fluid_lib} ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 07c6b964aa94b3cb11e9a26f2ca1d9ab75af6abe..5e9901bb87c9a454a393a913b6da6e82266ee719 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -37,8 +37,16 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) +paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) +paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) +paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -201,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cea4a448574cfa822ef1498cc3ea1d58a7aea43f..412bc9cbe88b860a698d17e239d7b94d8956b781 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,17 +1,18 @@ -# windows treat symbolic file as a real file, which is different with unix -# We create a hidden file and compile it instead of origin source file. +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs SRCS PATH) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) get_filename_component(src ${src} NAME_WE) if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") endif() - # only copy the xx.cu to .xx.cu when the content are modified +#only copy the xx.cu to.xx.cu when the content are modified set(copy_flag 1) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) @@ -32,7 +33,7 @@ endfunction() add_subdirectory(ir) add_subdirectory(details) -# ddim lib +#ddim lib proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) @@ -91,8 +92,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu if(WITH_GPU) if (WIN32) - # windows treat symbolic file as a real file, which is different with unix - # We create a hidden file and compile it instead of origin source file. +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. windows_symbolic(hidden_file SRCS data_type_transform.cu) nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) add_dependencies(data_type_transform hidden_file) @@ -143,7 +144,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) -# Generate an empty __init__.py to make framework_py_proto as a valid python module. +#Generate an empty \ + #__init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) if (NOT WIN32) @@ -195,7 +197,12 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +if(WITH_PSLIB) + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) +else() + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +endif(WITH_PSLIB) + cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index afb2dd2f064384da39904f6aceead4fa915a80f2..ee3c5e01f87eeb123f43f867296e35cc8adb7e8e 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -29,6 +29,9 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif namespace paddle { namespace framework { @@ -47,6 +50,11 @@ void AsyncExecutor::CreateThreads( worker->SetDataFeed(reader); worker->SetFetchVarNames(fetch_var_names); worker->BindingDataFeedMemory(); +#ifdef PADDLE_WITH_PSLIB + worker->SetPSlibPtr(_pslib_ptr); + worker->SetPullDenseThread(_pull_dense_thread); + worker->SetParamConfig(&_param_config); +#endif } void PrepareReaders(std::vector>& readers, // NOLINT @@ -60,12 +68,177 @@ void PrepareReaders(std::vector>& readers, // NOLINT readers[0]->SetFileList(filelist); } +#ifdef PADDLE_WITH_PSLIB +void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_server(dist_desc, index); + InitParamConfig(); +} + +void AsyncExecutor::InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, + int node_num, int index) { + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_worker( + dist_desc, const_cast(host_sign_list.data()), node_num, index); + + InitParamConfig(); +} + +uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); } + +void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } + +void AsyncExecutor::GatherServers(const std::vector& host_sign_list, + int node_num) { + _pslib_ptr->gather_servers(const_cast(host_sign_list.data()), + node_num); +} + +void AsyncExecutor::InitParamConfig() { + for (int i = 0; i < _pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param_size(); + ++i) { + if (_pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param(i) + .table_class() + .find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param(i) + .accessor() + .fea_dim(); + break; + } + } + _param_config.slot_dim = _param_config.fea_dim - 2; + _param_config.tmp_push_dense_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); + + for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); + ++t) { + _param_config.skip_op.push_back( + _pslib_ptr->get_param()->trainer_param().skip_op(t)); + } + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); + std::vector tmp_sparse_variable_name; + for (int i = 0u; i < table.slot_value_size(); ++i) { + tmp_sparse_variable_name.push_back(table.slot_value(i)); + _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); + } + std::vector tmp_sparse_gradient_variable_name; + for (auto i = 0u; i < table.slot_gradient_size(); ++i) { + tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i)); + } + _param_config.slot_input_vec[table.table_id()] = + std::move(tmp_sparse_variable_name); + _param_config.gradient_var[table.table_id()] = + std::move(tmp_sparse_gradient_variable_name); + _param_config.sparse_table_id.push_back(table.table_id()); + } + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); + std::vector tmp_dense_variable_name; + for (int i = 0u; i < table.dense_variable_name_size(); ++i) { + tmp_dense_variable_name.push_back(table.dense_variable_name(i)); + } + std::vector tmp_dense_gradient_variable_name; + for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) { + tmp_dense_gradient_variable_name.push_back( + table.dense_gradient_variable_name(i)); + } + _param_config.dense_variable_name[table.table_id()] = + std::move(tmp_dense_variable_name); + _param_config.dense_gradient_variable_name[table.table_id()] = + std::move(tmp_dense_gradient_variable_name); + _param_config.dense_table_id.push_back(table.table_id()); + _param_config.dense_table_size.push_back(table.fea_dim()); + } +} + +void AsyncExecutor::InitModel() { + for (auto table_id : _param_config.dense_table_id) { + std::vector regions; + for (auto& t : _param_config.dense_variable_name[table_id]) { + Variable* var = root_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + + float* g = tensor->data(); + CHECK(g != nullptr) << "var[" << t << "] value not initialized"; + + float init_range = 0.2; + int rown = tensor->dims()[0]; + init_range /= sqrt(rown); + + std::normal_distribution ndistr(0.0, 1.0); + for (auto i = 0u; i < tensor->numel(); ++i) { + g[i] = ndistr(local_random_engine()) * init_range; + } + + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); + } + + auto push_status = _pslib_ptr->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); + push_status.wait(); + auto status = push_status.get(); + if (status != 0) { + LOG(FATAL) << "push dense param failed, status[" << status << "]"; + exit(-1); + } + } +} + +void AsyncExecutor::SaveModel(const std::string& path) { + auto ret = _pslib_ptr->_worker_ptr->flush(); + ret.wait(); + ret = _pslib_ptr->_worker_ptr->save(path, 0); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 + LOG(FATAL) << "save model failed"; + exit(-1); + } +} + +void AsyncExecutor::PrepareDenseThread(const std::string& mode) { + if (mode == "mpi") { + DensePullThreadParam param; + param.ps_client = _pslib_ptr->_worker_ptr; + param.threshold = 1; + param.training_thread_num = actual_thread_num; + param.root_scope = root_scope_; + param.dense_params = &_param_config.dense_variable_name; + + _pull_dense_thread = + std::shared_ptr(new DensePullThread(param)); + _pull_dense_thread->start(); + } +} +#endif + void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::string& data_feed_desc_str, const std::vector& filelist, const int thread_num, const std::vector& fetch_var_names, - const bool debug) { + const std::string& mode, const bool debug) { std::vector threads; auto& block = main_program.Block(0); @@ -82,7 +255,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, &data_feed_desc); - int actual_thread_num = thread_num; + actual_thread_num = thread_num; int file_cnt = filelist.size(); PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); @@ -106,11 +279,21 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // todo: should be factory method for creating datafeed std::vector> readers; PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); - +#ifdef PADDLE_WITH_PSLIB + PrepareDenseThread(mode); +#endif std::vector> workers; workers.resize(actual_thread_num); for (auto& worker : workers) { +#ifdef PADDLE_WITH_PSLIB + if (mode == "mpi") { + worker.reset(new AsyncExecutorThreadWorker); + } else { + worker.reset(new ExecutorThreadWorker); + } +#else worker.reset(new ExecutorThreadWorker); +#endif } // prepare thread resource here @@ -128,7 +311,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } - +#ifdef PADDLE_WITH_PSLIB + if (mode == "mpi") { + _pull_dense_thread->stop(); + } +#endif root_scope_->DropKids(); return; diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index f4d2a79ac592e02f49ec0b988c824dc98883fbf6..95c8472b2f3b6b0c2d95fcf0c0b6f00e7f39b032 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -14,9 +14,11 @@ limitations under the License. */ #pragma once +#include #include #include -#include // NOLINT +#include // NOLINT +#include // local_random_engine #include #include #include // NOLINT @@ -30,6 +32,31 @@ limitations under the License. */ namespace paddle { namespace framework { + +inline double current_realtime() { +#if !defined(_WIN32) + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; +#else + return 0.0; +#endif +} + +inline std::default_random_engine& local_random_engine() { + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, + static_cast(current_realtime() * 1000)}; + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; +} + class AsyncExecutor { public: AsyncExecutor(Scope* scope, const platform::Place& place); @@ -39,7 +66,19 @@ class AsyncExecutor { const std::vector& filelist, const int thread_num, const std::vector& fetch_names, - const bool debug = false); + const std::string& mode, const bool debug = false); +#ifdef PADDLE_WITH_PSLIB + void InitServer(const std::string& dist_desc, int index); + void InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, int node_num, + int index); + uint64_t StartServer(); + void StopServer(); + void GatherServers(const std::vector& host_sign_list, int node_num); + void InitModel(); + void SaveModel(const std::string& path); + void InitParamConfig(); +#endif private: void CreateThreads(ExecutorThreadWorker* worker, @@ -48,10 +87,21 @@ class AsyncExecutor { const std::vector& fetch_var_names, Scope* root_scope, const int thread_index, const bool debug); +#ifdef PADDLE_WITH_PSLIB + void PrepareDenseThread(const std::string& mode); +#endif public: +#ifdef PADDLE_WITH_PSLIB + std::shared_ptr _pslib_ptr; + std::shared_ptr _pull_dense_thread; + AsyncWorkerParamConfig _param_config; +#endif Scope* root_scope_; platform::Place place_; + + private: + int actual_thread_num; }; } // namespace framework diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index a99cf53b410433c6e4b8a19821779f28c25e678f..41155cfb7714b10fa51bc56fc90af4ee3d8b4a1a 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -64,6 +64,7 @@ bool DataFeed::PickOneFile(std::string* filename) { return false; } *filename = filelist_[file_idx_++]; + LOG(ERROR) << "pick file:" << *filename; return true; } diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 97f7713d97494b1d6b11ef77796fbf4f75e598dc..63a68ba3a5c289be7c2d352717fe5911539df8a7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) +cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) - +cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle + all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -63,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass) +if (WITH_GPU) + list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) +endif() +cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph) +cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) @@ -84,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass multi_batch_merge_pass) + fuse_elewise_add_act_pass multi_batch_merge_pass + memory_optimize_pass) diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/analysis_var_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..223b9da3cfba33fc32d1334cddccb9f503bd0bef --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.cc @@ -0,0 +1,656 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +DEFINE_bool(enable_subgraph_optimize, false, + "SubGraph also reuse global graph variables, it will reduce the " + "memory occupation" + "but a higher risk of memory reuse error. default disabled."); +DEFINE_string(memory_optimize_debug, "", + "debug the operator output variable when do the variable reuse." + "memory reuse pass." + "only for debug, default disabled."); + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +template +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template +class FilterVariableImpl, Callback> { + public: + void operator()(const std::vector& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl()(nodes, callback); +} + +std::unique_ptr AnalysisVarPass::ApplyImpl( + std::unique_ptr graph) const { + auto nodes = graph->Nodes(); + auto subblock_vars = GetSubBlockVars(nodes); + skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); + + cfg_.reset(new details::ControlFlowGraph(*graph)); + cfg_->LiveVariableAnalysis(); + InitSSAGraphNodes(); + + int reuse_id = 0; + for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { + auto& op = cfg_->Ops()[idx]; + auto* op_desc = op->Op(); + // some op in graph has no op desc + if (op_desc == nullptr) continue; + if (OpHasSubBlock(op_desc)) { + if (FLAGS_enable_subgraph_optimize) { + SubGraphOptimize(op_desc); + } else { + VLOG(3) << op->Name() + << " has subblock, but disable subgraph optimize. skipped."; + continue; + } + } + + for (auto& var : op->outputs) { + if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { + ir::Node* cache = pool_.NodeMatch(var); + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } + if (cache != nullptr) { + if (var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused." + << var->Name() << " is re-filled to the pool after" + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + continue; + } + + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // update CFG Graph on the fly. + // reused var maybe re-fill into the pool + cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); + // NOTE(dzhwinter): we need to both update the ProgramDesc + // and IR Graph. because op_desc/var_desc is used in CreateOp, + // CreateVar when running happens. But IR Graph + // define the dependence relationship between nodes. + RenameVarInGraphDesc(var->Name(), cache->Name(), idx); + RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); + + pool_.Erase(cache); + } + } + } + // fill the pool + for (auto var : cfg_->LiveIn(op)) { + if (cfg_->LiveOut(op).count(var) == 0) { + ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); + if (var_node == nullptr) continue; + if (NodeCanReused(var_node) && !pool_.Has(var_node)) { + pool_.Insert(var_node, op); + } + } + } + } + graph->ResolveHazard(var_nodes_); + + // For early delete pass. use GraphNodePool load the unlived vars. + // 1. find all deps op for each unlived var in memory pool. + for (auto& op : graph->Nodes()) { + for (auto& var : op->inputs) { + if (pool_.Has(var)) { + pool_.Insert(var, op); + } + } + } + // 2. convert ir node based memory pool to graph node + // because Node* maybe released bettwen passes. + auto& graph_pool = graph->Get(kGraphNodePool); + for (auto it = pool_.begin(); it != pool_.end(); ++it) { + std::unordered_set descs; + for (auto& op : it->second) { + PADDLE_ENFORCE(op->IsOp()); + descs.insert(op->Op()); + } + graph_pool.push_back(std::make_pair(it->first->Name(), descs)); + } + + return graph; +} + +void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { + // conditional block, while op and their grad op + auto* sub_block_desc = + AttrReader(op_desc->GetAttrMap()).Get("sub_block"); + + // create a mirror block to construct an IR Graph. + ProgramDesc prog; + auto* copy_block = prog.MutableBlock(0); + for (auto* op : sub_block_desc->AllOps()) { + auto* copy_op = copy_block->AppendOp(); + copy_op->CopyFrom(*op); + copy_op->Flush(); + } + + for (auto* var : sub_block_desc->AllVars()) { + auto* copy_var = copy_block->Var(var->Name()); + copy_var->SetDataType(var->GetDataType()); + // only lod tensor can be reused. So ignore the multiple dims case. + copy_var->SetType(var->GetType()); + copy_var->SetShape(var->GetShape()); + copy_var->SetPersistable(var->Persistable()); + } + + ir::Graph sub_graph(prog); + std::unordered_set sub_graph_all_ops; + FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) { + // sub_graph_all_ops.emplace(var); + if (var->IsVar() && !var->IsCtrlVar()) { + sub_graph_all_ops.emplace(var); + } + }); + int sub_reuse_id = 0; + // subgraph nodes is unordered, reuse need to follow the desc order. + // find the right op node through the descs + for (auto* sub_op_desc : sub_block_desc->AllOps()) { + ir::Node* sub_op = nullptr; + for (auto* node : sub_graph_all_ops) { + if (node->Op() == sub_op_desc) { + sub_op = node; + break; + } + } + PADDLE_ENFORCE(sub_op != nullptr); + for (auto* var : sub_op->outputs) { + if (NodeCanReused(var)) { + ir::Node* cache = pool_.NodeMatch(var); + if (cache != nullptr) { + if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { + continue; + } + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(sub_reuse_id++), DebugString(var), + DebugString(cache), node_idx_in_pool, + static_cast(pool_.size())); + // NOTE(dzh): subblock is not in IR graph. Modify the block_desc + // immediately to make the subblock variable reuse strategy take + // effect. Because it is a single op in graph. No need to + // update the ir nodes. + sub_op_desc->Rename(var->Name(), cache->Name()); + if (sub_op_desc->Block()->HasVar(var->Name())) { + sub_op_desc->Block()->RemoveVar(var->Name()); + } + } + } + } + } +} + +std::unordered_set AnalysisVarPass::GetSubBlockVars( + const std::unordered_set& nodes) const { + std::unordered_set vars; + for (auto& op : nodes) { + if (!op->IsOp() || op->Op() == nullptr) continue; + auto* op_desc = op->Op(); + if (OpHasSubBlock(op_desc)) { + auto inputs = op_desc->InputArgumentNames(); + auto outputs = op_desc->OutputArgumentNames(); + vars.insert(inputs.begin(), inputs.end()); + vars.insert(outputs.begin(), outputs.end()); + } + } + return vars; +} + +void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, + size_t idx) const { + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); + } +} + +void AnalysisVarPass::InitSSAGraphNodes() const { + std::unordered_map> all_vars; + if (var_nodes_.empty()) { + for (auto* op : cfg_->Ops()) { + for (auto* node : op->inputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } + } +} + +void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, + size_t idx, ir::Graph* graph) const { + // if replace happens, we need to create a newer version cache_var + // but use the same dims/data_type with var. + PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + cache_node->inputs.emplace_back(prev_op); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + } + + // release node of unused var in graph + for (auto* node : var_nodes_[var]) { + graph->RemoveNode(node); + } + var_nodes_.at(var).clear(); +} + +bool AnalysisVarPass::NodeCanReused(ir::Node* node) const { + if (!node->IsVar() || node->IsCtrlVar()) return false; + auto* desc = node->Var(); + auto type = desc->GetType(); + if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || + desc->GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + std::string name = node->Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; + if (skip_set_.count(name)) return false; + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + // op output force generated in cpu, can not be reused. + return framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; + } + } + return true; +} + +bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector)) // NOLINT + return true; + } + return false; +} + +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get>(kAllOpDescs); + + // 2. topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} + +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) { + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) + // compute the liveness of for each variable though reversed_ops algorithm. + // It iterates the operators from end to begin, compute the live in/live out + // variable set for each op, then the diff between in/out will be used for + // the variable reuse. For detail refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. + auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors liveness + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } +} + +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + } + } +} + +const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::vector ControlFlowGraph::Ops() const { return ops_; } + +std::vector& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, + ir::Node* op) const { + // in ssa-graph, different version nodes have same name, + // this function get the latest version var before target op + // It may return nullptr, such as data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + if (output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/analysis_var_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..144204beafb341351172c29e3b4cd41db49be6f9 --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.h @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { +constexpr char kAllOpDescs[] = "all_op_descs"; + +std::vector SortOpLikeDescOrder(const ir::Graph& graph); +// sort op in bfs order +std::vector BFSSortGraphOps(const ir::Graph& graph); + +class ControlFlowGraph; + +class AnalysisVarPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + private: + // fill the variable map(var_nodes) by version. + void InitSSAGraphNodes() const; + // update program descs + void RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, size_t idx) const; + // update ir nodes + void RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, size_t idx, + ir::Graph* graph) const; + + void SubGraphOptimize(OpDesc* op_desc) const; + // valid a tensor can be reuse or not + bool NodeCanReused(ir::Node* node) const; + // scan subblock and collect the output/input variables. + std::unordered_set GetSubBlockVars( + const std::unordered_set&) const; + // check op has subblock or not + bool OpHasSubBlock(OpDesc* desc) const; + + private: + // Reuse Node Pool, Owned. + mutable OrderedNodePairPool pool_; + // controlflow Graph + mutable std::unique_ptr cfg_; + // skip set + mutable std::unordered_set skip_set_; + // var nodes + mutable std::map> var_nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // For IR Graph in parallelexecutor + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set LiveIn(ir::Node* op) const; + const std::set LiveOut(ir::Node* op) const; + const std::set Use(ir::Node* op) const; + const std::vector Ops() const; + std::vector& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + using NodeListMap = std::unordered_map>; + using VarSetMap = std::map>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector ops_; // op sequence by topology sort +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/analysis_var_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9bc4fd33f7058949ca60983ea666a21cb4877b3e --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass_test.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class DummyOp : public OperatorBase { + public: + DummyOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class AssignOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class DummyVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc& op_desc, BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto type = block->Var(inputs.front())->GetType(); + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(type); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +template +inline static std::string DebugString(const Container& c) { + std::stringstream ss; + for (auto& item : c) { + ss << item << " "; + } + return ss.str(); +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto op_descs = prog.Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // remove sum node + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. +TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + auto op_descs = prog.Block(0).AllOps(); + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d8526b3f2492992c5c0f6f5e0a85cffca7398700..779a9ed52365e66d8141f7e3a1183ef6d7832e4b 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -14,11 +14,16 @@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" +#include +#include + +#include "paddle/fluid/framework/details/memory_reuse_types.h" #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" namespace paddle { @@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } VLOG(1) << "CollectiveContext:" << context->String(); + // NOTE(dzh): memory optimize should be a runtime pass. + // However, after multi_devices_pass, VarHandle, OpHandle is + // the de-fact IR, any reuse on Graph is meaningless. + // A side-effect of that, memory optimize cannot forsee the fetched vars + // , so fetchlist should be set persistable before call the Run interface. + if (strategy.memory_optimize_) { + auto analysis_var_pass = AppendPass("analysis_var_pass"); + } // Convert graph to run on multi-devices. auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", @@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); - multi_devices_print_pass->SetNotOwned( - "debug_graphviz_path", &strategy_.debug_graphviz_path_); + const std::string graph_path = + string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(), + "_multi_devices_graph"); + multi_devices_print_pass->Set(kGraphvizPath, + new std::string(graph_path)); multi_devices_print_pass->Set( "graph_printer", new details::GraphvizSSAGraphPrinter); } @@ -127,7 +143,6 @@ std::unique_ptr BuildStrategy::Apply( CreatePassesFromStrategy(false); std::unique_ptr graph(new ir::Graph(main_program)); - for (std::shared_ptr &pass : pass_builder_->AllPasses()) { if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); @@ -145,6 +160,17 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { + const std::vector *all_op_descs = + new std::vector(main_program.Block(0).AllOps()); + graph->Set>(kAllOpDescs, + all_op_descs); // take ownership + graph->Set(kGraphNodePool, + new GraphNodePool); // take ownership + + pass->Erase(kAllOpDescs); + pass->SetNotOwned>(kAllOpDescs, all_op_descs); + } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; @@ -166,6 +192,7 @@ std::unique_ptr BuildStrategy::Apply( } return graph; } + } // namespace details } // namespace framework } // namespace paddle @@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass); USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); +USE_PASS(analysis_var_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index c97be169575f578dfd18a6290230d1b3f3bd7596..29396501dc0efedd31a42b77f915fd66c9943985 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -60,8 +60,15 @@ struct BuildStrategy { kCustomized = 2, }; + enum class OptimizeStrategy { + // To be Implemented,bruteforce, recursive compute unused var names. + kBruteForce = 0, + kControlFlowGraph = 1, // use cfg_graph algorithm, faster speed. + }; + ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; + OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph}; std::string debug_graphviz_path_{""}; @@ -69,6 +76,10 @@ struct BuildStrategy { bool enable_data_balance_{false}; + bool memory_optimize_{false}; + + bool memory_early_delete_{false}; + bool enable_sequential_execution_{false}; bool fuse_broadcast_op_{false}; diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..c8382d34b790ba7c95415acdf0b55dc97a9cd265 --- /dev/null +++ b/paddle/fluid/framework/details/early_delete_op_handle.h @@ -0,0 +1,140 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace details { + +class EarlyDeleteOpHandle : public OpHandleBase { + public: + EarlyDeleteOpHandle(ir::Node* node, const Scope* scope, + const platform::Place& place, + const std::vector& names, + GarbageCollector* gc) + : OpHandleBase(node), + scope_(scope), + place_(place), + names_(names), + gc_(gc) { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(place); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } +#endif + } + ~EarlyDeleteOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif + } + + std::string Name() const override { return "early_delete"; } + + protected: + void RunImpl() override { + std::vector> tensors; + auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + for (auto& var_name : names_) { + auto* var = local_scope->FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, + string::Sprintf("Local Scope not has var %s", var_name)); + if (var->IsType()) { + tensors.emplace_back(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + tensors.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); + } else if (var->IsType()) { + LoDTensorArray* tensor_array = var->GetMutable(); + for (auto& tensor : *tensor_array) { + tensors.emplace_back(tensor.MoveMemoryHolder()); + } + } + } + if (!tensors.empty()) { + ClearTensors(tensors); + } + } + + private: + void ClearTensors( + const std::vector>& tensors) { + if (platform::is_cpu_place(place_)) { + ClearCPUTensors(tensors); + } else { + ClearGPUTensors(tensors); + } + } + + void ClearCPUTensors( + const std::vector>& tensors) { + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + gc->Add(tensors); + } + } + + void ClearGPUTensors( + const std::vector>& tensors) { +#ifdef PADDLE_WITH_CUDA + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = gc->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { + gc_->Add(tensors); + } + } + + bool IsStreamGarabageCollector() const { + return dynamic_cast(gc_) != nullptr; +#endif + } + + const Scope* scope_; + const platform::Place place_; + std::vector names_; + GarbageCollector* gc_; +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext* dev_ctx_; + cudaEvent_t event_; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..06a2451c136e3243ba41661fa691f9a6ef8b52ac --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_early_delete_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) { + std::queue queue; + queue.push(var_in); + do { + auto* var = queue.front(); + queue.pop(); + for (auto* op : var->PendingOps()) { + auto* compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { + return compute_op; + } + for (auto* out_var : op->Outputs()) { + queue.push(out_var); + } + } + } while (!queue.empty()); + return nullptr; +} + +std::unique_ptr MemoryEarlyDeletePass::ApplyImpl( + std::unique_ptr graph) const { + auto& graph_pool = Get(kGraphNodePool); + auto& gcs = Get(kGarbageCollector); + + std::unordered_map> unlived_vars; + unlived_vars.reserve(graph_pool.size()); + for (auto& pair : graph_pool) { + unlived_vars.insert(std::make_pair(pair.first, pair.second)); + } + + auto compare_and_insert_early_delete_op = [&]( + OpHandleBase* op, const std::vector& vars) { + if (unlived_vars.empty()) return; + // unlived vars can be deleted after the last used op has finished. + auto* compute_op = dynamic_cast(op); + const auto& places = Get>(kAllPlaces); + for (auto& var : vars) { + auto* var_handle = dynamic_cast(var); + auto var_name = var->Node()->Name(); + auto& var_place = var_handle->place_; + if (unlived_vars.count(var_name) == 0) continue; + if (!unlived_vars[var_name].empty()) { + if (compute_op != nullptr && + unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) { + unlived_vars[var_name].erase(compute_op->Node()->Op()); + } + continue; + } + + if (var_handle == nullptr || !var_handle->Node()->IsVar() || + var_handle->Node()->IsCtrlVar()) + continue; + + // shameless copyed from reference count pass. + if (compute_op == nullptr) { + // use next computation op scope + compute_op = FindNextComputationOpHandle(var_handle); + } + auto* early_delete_node = + graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation); + GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get(); + auto* early_delete_handle = new EarlyDeleteOpHandle( + early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc); + if (compute_op->Outputs().empty()) { + auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + compute_op->AddOutput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + } + early_delete_handle->AddInput(compute_op->Outputs().front()); + VLOG(5) << "Add early delete op " << var_name << " to Operator" + << compute_op->Name(); + } + }; + + auto all_ops = ir::FilterByNodeWrapper(*graph); + for (auto& op : all_ops) { + compare_and_insert_early_delete_op(op, op->Inputs()); + compare_and_insert_early_delete_op(op, op->Outputs()); + } + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(memory_early_delete_pass, + paddle::framework::details::MemoryEarlyDeletePass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..8215aa1b2baa223a111f9050d5488c5fc8ac0e6e --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/details/early_delete_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class MemoryEarlyDeletePass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_reuse_types.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b9ff518b9adcd366cc877998400a8bdc05fa033 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +size_t NodeSizeInBytes(ir::Node* n) { + auto* desc = FindVarDescInBlock(n); + auto shape = desc->GetShape(); + size_t type_size = SizeOfType(desc->GetDataType()); + int size = 1; + for (auto& s : shape) { + size *= s; + } + return type_size * std::abs(size); +} + +std::string DebugStringImpl(VarDesc* var) { + std::stringstream ss; + ss << var->Name(); + ss << "["; + try { + auto shape = var->GetShape(); + for (size_t i = 0; i < shape.size(); ++i) { + if (i != shape.size() - 1) { + ss << shape[i] << ","; + } else { + ss << shape[i]; + } + } + ss << "]"; + } catch (...) { + ss << "Var has no VarDesc !!! Name:" << var->Name(); + } + return ss.str(); +} + +std::string DebugString(ir::Node* var) { + return DebugStringImpl(FindVarDescInBlock(var)); +} +// return DebugString(var->Var()); } + +// NOTE(dzh): based ir node, if a large node has been reused +// by a small size node, then next time it appear in pool, it will +// have the small size. Find the original node shap from blockdesc. +VarDesc* FindVarDescInBlock(ir::Node* n) { + PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); + BlockDesc* block = n->inputs[0]->Op()->Block(); + PADDLE_ENFORCE(block->HasVar(n->Name()), + string::Sprintf("Block do not has var %s", n->Name())); + return block->FindVar(n->Name()); +} + +struct NodeComparator { + bool operator()(ir::Node* lhs, ir::Node* rhs) const { + auto* lhs_desc = FindVarDescInBlock(lhs); + auto* rhs_desc = FindVarDescInBlock(rhs); + auto lhs_shape = lhs_desc->GetShape(); + auto rhs_shape = rhs_desc->GetShape(); + if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || + (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { + return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); + } else { + return false; + } + } +}; + +void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { + PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); + PADDLE_ENFORCE(op->IsOp()); + if (mark_table_.count(var->Name()) != 0) { + mark_table_[var->Name()]->second.insert(op); + return; + } + + auto* var_desc = FindVarDescInBlock(var); + auto var_shape = var_desc->GetShape(); + int batch_size = static_cast(var_shape[0]); + + NodeComparator compare_node; + Iter it = nodes_.begin(); + while (it != nodes_.end()) { + auto* cache_desc = FindVarDescInBlock(it->first); + int cache_batch_size = cache_desc->GetShape()[0]; + if ((cache_batch_size == -1 && batch_size == -1) || + (cache_batch_size != -1 && batch_size != -1)) { + if (compare_node(it->first, var)) { + ++it; + } else { + break; + } + } else if (cache_batch_size == -1 && batch_size != -1) { + ++it; + } else if (cache_batch_size != -1 && batch_size == -1) { + break; + } + } + + it = + nodes_.insert(it, std::make_pair(var, std::unordered_set{op})); + mark_table_[var->Name()] = it; +} + +int OrderedNodePairPool::GetIndex(ir::Node* var) { + return std::distance(nodes_.begin(), mark_table_[var->Name()]); +} + +ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { + ir::Node* found_node = nullptr; + NodeComparator compare_node; + + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + if (compare_node(var, it->first)) { + found_node = it->first; + break; + } + } + return found_node; +} + +void OrderedNodePairPool::Erase(ir::Node* var) { + PADDLE_ENFORCE(mark_table_.count(var->Name())); + nodes_.erase(mark_table_[var->Name()]); + mark_table_.erase(var->Name()); +} + +std::string OrderedNodePairPool::ToString() const { + std::stringstream ss; + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + ss << DebugString(it->first) << " "; + } + return ss.str(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_reuse_types.h new file mode 100644 index 0000000000000000000000000000000000000000..9a9c1d948e869016717fea9ff6b8236adfc29845 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.h @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +constexpr char kFetchedVars[] = "fetched_vars"; +constexpr char kGraphNodePool[] = "graph_node_pool"; + +// NOTE(dzh): Variable and the operators use the var. +// for early delete pass. +// Because analysis var pass build base on ir::Node, which maybe released +// or modified between passes, so we use OpDesc* to mark ops. +using GraphNodePool = std::vector< + std::pair /* ops */>>; + +// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size is determined in runtime. +// the node batch_size equal -1 always ranking in the front than the node not. +// For example, +// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. +// O(1) insert, delete +class OrderedNodePairPool { + public: + using NodePair = std::pair>; + using Iter = typename std::list::iterator; + using ConstIter = typename std::list::const_iterator; + + void Insert(ir::Node* var, ir::Node* op); + + void Erase(ir::Node* var); + + bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } + + ir::Node* NodeMatch(ir::Node* var) const; + // map store non-const iterator, can not promise const + int GetIndex(ir::Node* var); + // pool all node to string + std::string ToString() const; + + Iter begin() { return nodes_.begin(); } + Iter end() { return nodes_.end(); } + ConstIter begin() const { return nodes_.begin(); } + ConstIter end() const { return nodes_.end(); } + size_t size() const { return nodes_.size(); } + + private: + // for searching. + std::unordered_map mark_table_; + // node swap pairs. var -> ops dep var + std::list nodes_; +}; + +// node memory size in bytes +size_t NodeSizeInBytes(ir::Node* n); + +std::string DebugString(ir::Node* var); + +// std::string DebugString(VarDesc* var); +VarDesc* FindVarDescInBlock(ir::Node* n); + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_reuse_types_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2fabf5ce068e0f752b86c0d02b971f18fc65f01 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types_test.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(OrderedNodePairPool, Normal) { + OrderedNodePairPool pool; + std::vector> nodes; + + // clang-format off + std::vector> shapes = {{-1, 10}, + {-1, 20}, + {1, 2}, + {5, 2}, + {10, 20}, + {-1, 2, 5}, + {-1, 1, 5}, + {-1, 1}}; + // clang-format on + const int COUNT = shapes.size(); + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + for (int i = 0; i < COUNT; ++i) { + auto desc = block_desc->Var(std::to_string(i)); + desc->SetShape(shapes[i]); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + for (auto& node : nodes) { + pool.Insert(node.get(), op.get()); + } + + // assert its order and interface. + std::cout << pool.ToString() << std::endl; + pool.Erase(nodes.front().get()); + std::cout << pool.ToString() << std::endl; + + ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); + ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); + + { + auto v1 = block_desc->Var("11"); + v1->SetShape({-1, 256, 56, 56}); + std::unique_ptr node1 = ir::CreateNodeForTest(v1); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(cache, nullptr); + } + { + auto v2 = block_desc->Var("12"); + v2->SetShape({-1, 2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v2); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] + } + { + auto v3 = block_desc->Var("13"); + v3->SetShape({2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v3); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 8f92f0948d7d397ab0f20c01eae9e313f739adec..c203073845375c879a0fc10564f5dad0f19ceae4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, } // namespace paddle REGISTER_PASS(multi_devices_print_pass, - paddle::framework::details::SSAGraghBuilderWithPrinter); + paddle::framework::details::SSAGraghBuilderWithPrinter) + .RequirePassAttr(paddle::framework::details::kGraphvizPath); diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h index c00685fa1629c0722c315c726053c2cba8bf17e7..b06c87a5c185c550818af0bdeacd0070d1d90e4e 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kGraphvizPath[] = "debug_graphviz_path"; + class SSAGraphPrinter { public: virtual ~SSAGraphPrinter() {} @@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass { std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { std::unique_ptr fout( - new std::ofstream(Get("debug_graphviz_path"))); + new std::ofstream(Get(kGraphvizPath))); PADDLE_ENFORCE(fout->good()); Get("graph_printer").Print(*graph, *fout); return graph; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index ba12ca3c61c05b3e856fffa8353d4ec5bf79bc39..b1a82e8771b92f2d0af4a1c7732ff2da54d496a8 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -25,7 +25,7 @@ namespace paddle { namespace framework { namespace details { -constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; +constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@"; // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8c3912120b5b405eb14da8493aaae6e7d12537a0..da9556c6c1f3468208db02f2958ad6ad137c6566 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -100,7 +100,7 @@ static void DeleteUnusedTensors( continue; } auto* var = scope.FindVar(name); - if (var != nullptr) { + if (var == nullptr) { continue; } diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 8725ed46d595a2edb16a71ed5f23e706448e3ecd..2eb9e564f87807e88def536ee875ebe0d1e83cd6 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor_thread_worker.h" +#include #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" @@ -32,6 +33,89 @@ limitations under the License. */ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB +int DensePullThread::start() { + _running = true; + _t = std::thread(&DensePullThread::run, this); + return 0; +} + +void DensePullThread::run() { + while (_running) { + _pull_dense_status.resize(0); + for (auto& t : _dense_variable_name) { + if (check_update_param(t.first)) { + auto status = pull_dense(t.first); + _pull_dense_status.emplace_back(std::move(status)); + reset_thread_version(t.first); + } + } + if (_pull_dense_status.size() != 0) { + wait_all(); + } + + usleep(_sleep_time_ms * 1000); + } +} +bool DensePullThread::check_update_param(uint64_t table_id) { + { + std::lock_guard lock(_mutex_for_version); + auto& version = _training_versions[table_id]; + _current_version[table_id] = + *(std::min_element(version.begin(), version.end())); + } + if (_current_version[table_id] - _last_versions[table_id] < _threshold) { + return false; + } + return true; +} + +void DensePullThread::reset_thread_version(uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _last_versions[table_id] = _current_version[table_id]; +} +std::future DensePullThread::pull_dense(uint64_t table_id) { + auto& regions = _regions[table_id]; + regions.clear(); + auto& variables = _dense_variable_name[table_id]; + regions.resize(variables.size()); + + for (auto i = 0u; i < variables.size(); ++i) { + auto& t = variables[i]; + Variable* var = _root_scope->FindVar(t); + LoDTensor* tensor = var->GetMutable(); + + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions[i] = std::move(reg); + } + return _ps_client->pull_dense(regions.data(), regions.size(), table_id); +} + +void DensePullThread::wait_all() { + for (auto& t : _pull_dense_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; + } + } + + if (_pull_dense_fail_times > 20) { + LOG(FATAL) << "pull dense failed times more than 20 times"; + exit(-1); + } + + _pull_dense_status.resize(0); +} + +void DensePullThread::increase_thread_version(int thread_id, + uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _training_versions[table_id][thread_id]++; +} +#endif + void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { auto& block = program.Block(0); op_names_.clear(); @@ -202,5 +286,358 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { root_scope_ = g_scope; } +#ifdef PADDLE_WITH_PSLIB +// AsyncExecutor +void AsyncExecutorThreadWorker::TrainFiles() { + SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + TrainOneNetwork(); + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) + } // end while () +} + +void AsyncExecutorThreadWorker::SetPSlibPtr( + std::shared_ptr pslib_ptr) { + _pslib_ptr = pslib_ptr; +} +void AsyncExecutorThreadWorker::SetPullDenseThread( + std::shared_ptr dpt) { + _pull_dense_thread = dpt; +} +void AsyncExecutorThreadWorker::TrainOneNetwork() { + PrepareParams(); + + for (auto& op : ops_) { + if (op->Type().find("sgd") != std::string::npos) { + continue; + } + bool need_skip = false; + for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { + if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); + } + } + UpdateParams(); +} + +void AsyncExecutorThreadWorker::SetParamConfig( + AsyncWorkerParamConfig* param_config) { + _param_config = param_config; +} + +void AsyncExecutorThreadWorker::PrepareParams() { + for (auto table_id : _param_config->sparse_table_id) { + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } + } + } + _pull_sparse_status.resize(0); + + for (auto table_id : _param_config->sparse_table_id) { + FillSparse(table_id); + } +} + +void AsyncExecutorThreadWorker::UpdateParams() { + for (auto i : _param_config->sparse_table_id) { + PushSparse(i); + } + for (auto i : _param_config->dense_table_id) { + PushDense(i); + } + int32_t tmp_push_dense_wait_times = -1; + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_dense_wait_times = + static_cast(tmp_push_dense_wait_times); + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + + if (_push_dense_status.size() >= push_dense_wait_times) { + for (auto& t : _push_dense_status) { + t.wait(); + } + _push_dense_status.resize(0); + } + if (tmp_push_dense_wait_times == -1) { + _push_dense_status.resize(0); + } + if (_push_sparse_status.size() >= push_sparse_wait_times) { + for (auto& t : _push_sparse_status) { + t.wait(); + } + _push_sparse_status.resize(0); + } + if (tmp_push_sparse_wait_times == -1) { + _push_sparse_status.resize(0); + } + for (auto dense_table_id : _param_config->dense_table_id) { + _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + } +} + +void AsyncExecutorThreadWorker::PushDense(int table_id) { + std::vector regions; + for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { + Variable* var = thread_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g = tensor->data(); + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + + auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); + _push_dense_status.push_back(std::move(status)); +} + +void AsyncExecutorThreadWorker::PullSparse(int table_id) { + auto& features = _features[table_id]; + auto& feature_value = _feature_value[table_id]; + auto fea_dim = _param_config->fea_dim; + // slot id starts from 1 + features.clear(); + features.resize(0); + features.reserve(MAX_FEASIGN_NUM); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + // todo(colourful-tree): current trick - filter feasign=use_slot_mod( + // bug: datafeed fill use_slot_mod for empty slot) + if (ids[i] == 0u) { + continue; + } + features.push_back(static_cast(ids[i])); + } + } + check_pull_push_memory(features, &feature_value, fea_dim); + + std::vector pull_feature_value; + for (auto i = 0u; i < features.size(); ++i) { + pull_feature_value.push_back(feature_value[i].data()); + } + + auto status = _pslib_ptr->_worker_ptr->pull_sparse( + pull_feature_value.data(), table_id, features.data(), features.size()); + _pull_sparse_status.push_back(std::move(status)); + + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, &push_g, fea_dim); + + collect_feasign_info(table_id); +} + +void AsyncExecutorThreadWorker::FillSparse(int table_id) { + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& fea_value = _feature_value[table_id]; + + CHECK(features.size() > 0) << "feature size check failed"; + + auto fea_idx = 0u; + + std::vector init_value(fea_dim); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + Variable* var_emb = thread_scope_->FindVar( + _param_config->slot_input_vec[table_id][slot_idx - 1]); + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = + tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); + memset(ptr, 0, sizeof(float) * len * slot_dim); + auto& tensor_lod = tensor->lod()[0]; + + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + + for (auto index = 0u; index < len; ++index) { + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, init_value.data() + 2, + sizeof(float) * slot_dim); + continue; + } + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, + sizeof(float) * slot_dim); + fea_idx++; + } + } +} + +void AsyncExecutorThreadWorker::PushSparse(int table_id) { + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, &push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) + << "push_g size:" << push_g.size() + << " features size:" << features.size(); + uint64_t fea_idx = 0u; + auto& fea_info = _fea_info[table_id]; + int offset = 2; + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == + _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx + << " name:" << feed_vec[slot_idx]; + } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != + table_id) { + continue; + } + Variable* g_var = thread_scope_->FindVar( + _param_config->gradient_var[table_id][slot_idx - 1]); + CHECK(g_var != nullptr) + << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; + LoDTensor* g_tensor = g_var->GetMutable(); + if (g_tensor == NULL) { + LOG(ERROR) << "var[" + << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; + exit(-1); + } + float* g = g_tensor->data(); + + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; + LoDTensor* tensor = var->GetMutable(); + if (tensor == NULL) { + LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; + exit(-1); + } + int len = tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) + << "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << len + << "t_numel:" << tensor->numel(); + int64_t* ids = tensor->data(); + for (auto id_idx = 0u; id_idx < len; ++id_idx) { + if (ids[id_idx] == 0) { + g += slot_dim; + continue; + } + memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); + push_g[fea_idx][0] = 1.0f; + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx + << " size:" << fea_info.size(); + push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); + g += slot_dim; + fea_idx++; + } + } + CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx + << " features size:" << features.size(); + CHECK_GT(features.size(), 0); + + std::vector push_g_vec; + for (auto i = 0u; i < features.size(); ++i) { + push_g_vec.push_back(push_g[i].data()); + } + auto status = _pslib_ptr->_worker_ptr->push_sparse( + table_id, features.data(), (const float**)push_g_vec.data(), + features.size()); + _push_sparse_status.push_back(std::move(status)); +} + +void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) { + auto& fea_info = _fea_info[table_id]; + auto& feature = _features[table_id]; + fea_info.resize(feature.size()); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + Variable* var = thread_scope_->FindVar(feed_vec[0]); + LoDTensor* tensor = var->GetMutable(); + int64_t* label = tensor->data(); + + int global_index = 0; + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + + int fea_idx = 0; + for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { + for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { + if (ids[fea_idx] == 0u) { + continue; + } + FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; + + fea_info[global_index++] = std::move(info); + } + } + } + CHECK(global_index == feature.size()) + << "expect fea info size:" << feature.size() << " real:" << global_index; +} + +void AsyncExecutorThreadWorker::check_pull_push_memory( + const std::vector& features, + std::vector>* push_g, int dim) { + push_g->resize(features.size() + 1); + for (auto& t : *push_g) { + t.resize(dim); + } +} + +void AsyncExecutorThreadWorker::check_pull_push_memory( + const std::vector& features, std::vector* push_g, + int dim) { + if (features.size() > push_g->size()) { + push_g->reserve(features.size() + 1); + auto size = features.size() - push_g->size() + 1; + for (auto i = 0u; i < size; ++i) { + float* ptr = new float[dim]; + push_g->push_back(ptr); + } + } +} +#endif + } // einit_modelnd namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 13ec2442c46459116320236bf98f23c91340f389..30b81ad88035eacc7a8efbe6d20f03d362122003 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -25,16 +25,119 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif namespace paddle { namespace framework { + void CreateTensor(Variable* var, proto::VarType::Type var_type); +#ifdef PADDLE_WITH_PSLIB +static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; + +struct AsyncWorkerParamConfig { + int slot_dim; + int fea_dim; + int32_t tmp_push_dense_wait_times; + int32_t tmp_push_sparse_wait_times; + + std::vector skip_op; + + std::map> dense_variable_name; + std::map> dense_gradient_variable_name; + std::vector dense_table_id; + // fea_dim for each dense table + std::vector dense_table_size; + std::vector sparse_table_id; + std::map> slot_input_vec; + std::map> gradient_var; + std::map slot_alias_to_table; +}; + +struct DensePullThreadParam { + std::shared_ptr ps_client; + int threshold; + int training_thread_num; + Scope* root_scope; + std::map>* dense_params; + int sleep_time_ms = 2; +}; + +class DensePullThread { + public: + explicit DensePullThread(const DensePullThreadParam& param) + : _running(false) { + _ps_client = param.ps_client; + _threshold = param.threshold; + _thread_num = param.training_thread_num; + _root_scope = param.root_scope; + _sleep_time_ms = param.sleep_time_ms; + + for (auto& t : *param.dense_params) { + _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(), + t.second.begin(), t.second.end()); + _training_versions[t.first].resize(_thread_num, 0); + _last_versions[t.first] = 0; + _current_version[t.first] = 0; + } + } + + int start(); + + void stop() { + if (_running) { + _running = false; + _t.join(); + } + } + + void increase_thread_version(int thread_id, uint64_t table_id); + void reset_thread_version(uint64_t table_id); + std::future pull_dense(uint64_t table_id); + void pull_dense2(uint64_t table_id); + void wait_all(); + + private: + void run(); + bool check_update_param(uint64_t table_id); + + private: + std::shared_ptr _ps_client; + int _thread_num; + int _threshold; + int _sleep_time_ms; + Scope* _root_scope; + bool _running; + + std::map _last_versions; + std::map _current_version; + std::mutex _mutex_for_version; + std::map> _training_versions; + std::map> _dense_variable_name; + + std::thread _t; + + std::vector<::std::future> _pull_dense_status; + + std::map> _regions; + uint32_t _pull_dense_fail_times = 0; + + std::vector _base_norm_param; + std::vector _mean; + std::vector _scale; + float _squared_sum_epsilon = 1e-4; + std::mutex _mutex_for_mean_scale; + + float _total_batch_num = 0; +}; +#endif class ExecutorThreadWorker { public: ExecutorThreadWorker() : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} - ~ExecutorThreadWorker() {} + virtual ~ExecutorThreadWorker() {} void CreateThreadResource(const framework::ProgramDesc& program, const paddle::platform::Place& place); @@ -51,9 +154,15 @@ class ExecutorThreadWorker { // set data feed declared in executor void SetDataFeed(const std::shared_ptr& datafeed); // A multi-thread training function - void TrainFiles(); + virtual void TrainFiles(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); +#ifdef PADDLE_WITH_PSLIB + virtual void SetPSlibPtr( + std::shared_ptr pslib_ptr) {} + virtual void SetPullDenseThread(std::shared_ptr dpt) {} + virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {} +#endif private: void CreateThreadScope(const framework::ProgramDesc& program); @@ -77,12 +186,58 @@ class ExecutorThreadWorker { Scope* root_scope_; // a thread scope, father scope is global score which is shared Scope* thread_scope_; - - private: std::vector fetch_var_names_; std::vector> fetch_values_; bool debug_; }; +#ifdef PADDLE_WITH_PSLIB +class AsyncExecutorThreadWorker : public ExecutorThreadWorker { + public: + AsyncExecutorThreadWorker() {} + virtual ~AsyncExecutorThreadWorker() {} + void SetPSlibPtr(std::shared_ptr pslib_ptr); + void SetPullDenseThread(std::shared_ptr dpt); + void SetParamConfig(AsyncWorkerParamConfig* param_config); + void TrainFiles(); + void TrainOneNetwork(); + void PrepareParams(); + void UpdateParams(); + void PullSparse(int table_id); + void FillSparse(int table_id); + void PushSparse(int table_id); + void PushDense(int table_id); + + void check_pull_push_memory(const std::vector& features, + std::vector* push_g, int dim); + void check_pull_push_memory(const std::vector& features, + std::vector>* push_g, int dim); + void collect_feasign_info(int table_id); + + private: + struct FeasignInfo { + uint32_t slot; + uint32_t ins; + int64_t label; + }; + + std::map> _features; + std::map> _fea_info; + std::map>> _feature_value; + std::map>> _feature_push_value; + + std::shared_ptr _pslib_ptr; + + std::shared_ptr _pull_dense_thread; + + std::vector<::std::future> _pull_sparse_status; + std::vector<::std::future> _pull_dense_status; + std::vector<::std::future> _push_sparse_status; + std::vector<::std::future> _push_dense_status; + + AsyncWorkerParamConfig* _param_config; +}; +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index be4151b54b6087c10989c093bf007ccf3006bd65..b7f7e2ee8ef590c0d0d8307de4400a8ce8ad4e7d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -44,6 +44,7 @@ pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) +pass_library(conv_elementwise_add_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..476c9dbc353f865916d0065bbce653d7b7204dce --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); + +std::unique_ptr ConvElementwiseAddFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string output_name = elementwise_add_out->Name(); + + std::string act_type = "identity"; + framework::OpDesc new_op_desc(base_op_desc, nullptr); + new_op_desc.SetType("conv2d_fusion"); + new_op_desc.SetInput("Bias", {bias_name}); + new_op_desc.SetInput("ResidualData", {}); + new_op_desc.SetAttr("activation", act_type); + new_op_desc.SetOutput("Output", {output_name}); + new_op_desc.SetAttr("is_test", true); + new_op_desc.SetAttr("use_cudnn", false); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_fuse_pass, + paddle::framework::ir::ConvElementwiseAddFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..f234603f5856a9238164f7fb0e5cc81ea9b7ed60 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8679118fe28b1c68aea30caf711441823b5255c0..8670dcfed7e40473e06cd12cecc1157dd4f54aa0 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -162,7 +162,10 @@ void Graph::ResolveHazard( (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; const auto &read_ops = (*it_old)->outputs; - PADDLE_ENFORCE(write_op, "The write_op should not be empty."); + PADDLE_ENFORCE( + write_op, + string::Sprintf("The write_op of var %s should not be empty.", + (*it_new)->Name())); // Add write after write dependence ir::Node *upstream_op = diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index d2d28793c4320e3664bb69c65dab4fec830e4d02..d99f856d8f46ea760ce07533446ce3bec95d7d27 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include DEFINE_string(print_sub_graph_dir, "", @@ -121,7 +122,7 @@ std::map> BuildOperationAdjList( } size_t GraphNum(const Graph &graph) { - std::unordered_set nodes = graph.Nodes(); + std::unordered_set nodes(graph.Nodes()); std::unordered_set visited_nodes; visited_nodes.reserve(nodes.size()); std::deque q_nodes; diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 8d92c406689ab3a97596a8666ceb452aec4be170..be525151f9f9749b913a7e5111e5622d868bd266 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { + // Test if the graph contains circle. bool HasCircle(const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index bf12d12459c59304167d9c52059a068b50de3980..13d752e5167c039ec8d9e4300b190a726bb02a63 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -17,7 +17,6 @@ #include #include -#include "graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -1210,6 +1209,33 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { return act_out; } +PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + + return elementwise_add_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0fee2f1c1852b296b3599d4b7219a032062a1d49..eaedd9d08e0fab820481d6eaacb6e7bfc1ab6d1d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -716,6 +716,24 @@ struct ConvElementwiseadd2Act : public PatternBase { PATTERN_DECL_NODE(act_out); }; +// Conv + ElementwiseAdd +// This pattern should be used after ConvElementwiseadd2Act or +// ConvElementwiseadd pass +struct ConvElementwiseadd : public PatternBase { + ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); + PATTERN_DECL_NODE(elementwise_add_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index eac67108e2106e986cbe1255a64c956153bc5560..45d81b937392244f678fbd01395b3ffffd07f710 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,6 +30,14 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } +std::unique_ptr CreateNodeForTest(VarDesc *var_desc) { + return std::unique_ptr(new Node(var_desc)); +} + +std::unique_ptr CreateNodeForTest(OpDesc *op_desc) { + return std::unique_ptr(new Node(op_desc)); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f19e9aab79098757dae663d030b0fa2b..89dcc677b57eba356c0b6af857f9f8ff6273a683 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" @@ -125,6 +124,8 @@ class Node { friend class Graph; friend std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); + friend std::unique_ptr CreateNodeForTest(VarDesc* var_desc); + friend std::unique_ptr CreateNodeForTest(OpDesc* op_desc); explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} @@ -152,7 +153,9 @@ class Node { std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); +std::unique_ptr CreateNodeForTest(VarDesc* var_desc); +std::unique_ptr CreateNodeForTest(OpDesc* op_desc); } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index eb4baa06b5284512eab128e57f893bad43afda97..7e3fe02eaf5560ef03e42c6b82ed338edc30b0ab 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" +#include #include #include #include @@ -93,6 +94,7 @@ class ParallelExecutorPrivate { } } + BuildStrategy build_strategy_; std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -169,6 +171,14 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + if (build_strategy_.memory_early_delete_) { + auto early_delete_pass = + ir::PassRegistry::Instance().Get("memory_early_delete_pass"); + early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + graph = early_delete_pass->Apply(std::move(graph)); + } + VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; @@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor( build_strategy.Apply(main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_); #endif - auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { graph = member_->PrepareGCAndRefCnts(std::move(graph), @@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor( if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle +USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index a0a9a573603ceb6b577529101cb331adbc81337a..83dea8639010f77619a6fc2a81e092ae513c6e79 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -74,6 +74,22 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), platform::CPUPlace()); EXPECT_EQ(p1, p2); + + float* p3 = nullptr; + float* p4 = nullptr; + // set src_tensor a different type but smaller size. + // memory block is supposed to be unchanged. + auto* tmp = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + p3 = reinterpret_cast(tmp); + EXPECT_EQ(p1, p3); + + // set src_tensor a different type but bigger size. + // memory block is supposed to be changed. + auto* tmp2 = src_tensor.mutable_data( + framework::make_ddim({2, 2, 3}), platform::CPUPlace()); + p4 = reinterpret_cast(tmp2); + EXPECT_NE(p1, p4); } // Not sure if it's desired, but currently, Tensor type can be changed. { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 058a5b5f460d2bd3c4c0248929dd0c87f7506930..b80e7ef752c5251e3ea3f9d9c11f6a2b1422cd34 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -26,9 +26,6 @@ endif(WIN32) # paddle_fluid_origin exclude inference api interface if(WIN32) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid_origin ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) endif(WIN32) @@ -44,9 +41,6 @@ set(SHARED_INFERENCE_SRCS if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) @@ -63,9 +57,6 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) target_link_libraries(paddle_fluid_shared shlwapi) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid_origin ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 4ffe5f575c232ccfc0089cb86e28737e56b32f94..9c42b83e7add348433635b1899087324e4e370d4 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -63,7 +63,6 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, Graph *graph) const { auto *op_desc = node->Op(); - static int counter{0}; auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, block_desc.Proto()->SerializeAsString()); SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); - SetAttr(op_desc->Proto(), "engine_uniq_key", - "trt-" + std::to_string(counter++)); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); } diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index ec93729cd2b379dc2ac39b51df6799b74c8529b6..8d0d96d391efd7f0f11e9d48f5a6221431bd3824 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -15,12 +15,43 @@ macro(safe_set_static_flag) endforeach(flag_var) endmacro() +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + +include_directories("${PADDLE_LIB}/") +include_directories("${PADDLE_LIB}/fluid_inference_install_dir/") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") +include_directories("${PADDLE_LIB}/third_party/install/snappy/include") +include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") +link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") if (WITH_STATIC_LIB) safe_set_static_flag() add_definitions(-DSTATIC_LIB) - set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w") - set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w") endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") else() @@ -29,36 +60,15 @@ else() endif() message("flags" ${CMAKE_CXX_FLAGS}) -if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") -endif() -if(NOT DEFINED DEMO_NAME) - message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") -endif() - - if(WITH_GPU) if(NOT WIN32) set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") else() if(CUDA_LIB STREQUAL "") - set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") endif() endif(NOT WIN32) endif() -include_directories("${PADDLE_LIB}") -include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") -include_directories("${PADDLE_LIB}/third_party/install/glog/include") -include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") -if (NOT WIN32) -include_directories("${PADDLE_LIB}/third_party/install/snappy/include") -include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") -include_directories("${PADDLE_LIB}/third_party/install/zlib/include") -endif(NOT WIN32) - -include_directories("${PADDLE_LIB}/third_party/boost") -include_directories("${PADDLE_LIB}/third_party/eigen3") if (NOT WIN32) if (USE_TENSORRT AND WITH_GPU) @@ -67,18 +77,6 @@ if (NOT WIN32) endif() endif(NOT WIN32) -if (NOT WIN32) -link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") -link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") -link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") -endif(NOT WIN32) - -link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") -link_directories("${PADDLE_LIB}/third_party/install/glog/lib") -link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") -link_directories("${PADDLE_LIB}/paddle/lib") - if (NOT WIN32) set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") if(EXISTS ${NGRAPH_PATH}) @@ -89,8 +87,6 @@ if (NOT WIN32) endif() endif() -add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) - if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} @@ -106,26 +102,25 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a if(WITH_STATIC_LIB) - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() if (NOT WIN32) -set(EXTERNAL_LIB "-lrt -ldl -lpthread") -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} - glog gflags protobuf snappystream snappy z xxhash - ${EXTERNAL_LIB}) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} + glog gflags protobuf snappystream snappy z xxhash + ${EXTERNAL_LIB}) else() -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf - ${EXTERNAL_LIB}) -# NOTE(dzhwinter) shlwapi is deprecated. -set(DEPS ${DEPS} libcmt shlwapi) + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf + ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash + snappystream ${EXTERNAL_LIB}) + # NOTE(dzhwinter) shlwapi is deprecated. + set(DEPS ${DEPS} libcmt shlwapi) endif(NOT WIN32) if(WITH_GPU) @@ -137,9 +132,10 @@ if(WITH_GPU) set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) endif() endif() +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index e6e7de24783b160769e0c9f43d8f0700a035c314..40ca0d287ccde113a20abb1036af289a36f54e6c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -122,6 +122,7 @@ class GpuPassStrategy : public PassStrategy { "conv_bn_fuse_pass", // "conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); } diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d61d635ed707bc455d495f2420925a3585234b5c..91670ba8ac5332fe6e83b7bff14cb1a349d7e2a2 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -103,6 +103,7 @@ class OpConverter { void ConvertBlock(const framework::proto::BlockDesc& block, const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine) { + std::unique_lock lk(mut_); for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); ConvertOp(op, parameters, scope, engine); @@ -125,6 +126,7 @@ class OpConverter { std::unordered_map converters_; // fluid inference scope framework::Scope* scope_{nullptr}; + std::mutex mut_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8a4bc04b67879918c6ac8d1b40dae68a107034d4..46ce61b73611d05369f90e7d8f97e9b6724b860f 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename ARGS --infer_model=${install_dir}/model) endfunction() +function(inference_analysis_api_test_with_refer_result target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt) +endfunction() + # RNN1 if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") @@ -83,14 +90,21 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() -inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) + +# mobilenet with transpose op +set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") +if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) + inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") +endif() +inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") # mobilenet with depthwise_conv op -inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet +inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") # anakin diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index adaa338e289936a7e6915bd23eba86863481dd06..a8f7d5c4461964bcb18bc8df24e282ea89264aa8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -93,18 +93,20 @@ void profile(bool use_mkldnn = false) { SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { - const float ocr_result_data[] = { - 5.273636460856323538e-08, 3.296741795111302054e-07, - 1.873261190610264748e-08, 3.403730275408634043e-08, - 3.383312474625199684e-08}; - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); - PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); - for (size_t i = 0; i < std::min(5UL, size); i++) { - EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3); + std::string line; + std::ifstream file(FLAGS_refer_result); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + auto &output = outputs.front(); + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + CHECK_EQ(numel, refer.data.size()); + for (size_t i = 0; i < numel; ++i) { + CHECK_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); } } } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 4c8bce4600a7ac47728a045a73290bba29e6f336..b07949c196ca1d41bb33a0b0499ebb3204d1be4a 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -36,6 +36,7 @@ DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); +DEFINE_string(refer_result, "", "reference result for comparison"); DEFINE_int32(batch_size, 1, "batch size."); DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 3b7c781795f02b9d9c9f2ead51034193ceb2a745..6a92762896b89a06a91cd11fb38587f7df69e6c3 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -146,7 +146,8 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { // At least use 32 threads, at most 512 threads. // blockx is multiple of 32. - int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L); + int blockx = std::min( + static_cast(((feature_width * num_priors + 31) >> 5) << 5), 512L); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 2d262f932aed9761143f7983c9a38f7a97c374ea..862d664d42e03d2ae968ea0bdec8ae8e50bf7fb3 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -35,12 +35,12 @@ namespace operators { template __device__ bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; + return (a > b) || Eigen::numext::abs(a - b) < 1e-4; } template __device__ bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; + return (a < b) || Eigen::numext::abs(a - b) < 1e-4; } template diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index c3974138f4d4665c46bdfccaef09c0bd84b9d028..cda102e78d2de2876d54418574b7e07211fc92b4 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -488,7 +488,7 @@ void AsyncGRPCServer::HandleRequest( while (true) { VLOG(4) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; + LOG(WARNING) << "CompletionQueue " << rpc_name << " shutdown!"; break; } @@ -511,9 +511,8 @@ void AsyncGRPCServer::HandleRequest( // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { - LOG(WARNING) << "completion queue:" << rpc_name - << " recv no regular event" - << " context:" << base->Status2String(rpc_name); + VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" + << " context:" << base->Status2String(rpc_name); TryToRegisterNewOne(rpc_name, req_id); delete base; continue; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index b73a32af89e882ac02623dd1d312f400a78fc47a..d212e6f8437e69e71c010b6af27a33ff5e39e1e1 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { label.data())); } - auto& place = *ctx.template device_context().eigen_device(); - auto pre_out_mat = EigenMatrix::From(pre_out); - auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); - auto out_grad_mat = EigenMatrix::From(out_grad); + // softrelu derivative - Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; + auto blas = math::GetBlas(ctx); - // softrelu derivative - pre_out_grad_mat.device(place) = - static_cast(1.0) - static_cast(1.0) / pre_out_mat.exp(); + auto* pre_out_grad_data = pre_out_grad.data(); + auto* pre_out_data = pre_out.data(); + auto n = pre_out.numel(); + blas.VEXP(n, pre_out_data, pre_out_grad_data); + blas.VINV(n, pre_out_grad_data, pre_out_grad_data); + for (int64_t i = 0; i < n; ++i) { + pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; + } bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) - pre_out_grad_mat.device(place) = - pre_out_grad_mat * out_grad_mat.broadcast(bcast); + auto* out_grad_data = out_grad.data(); + + int64_t dim0 = pre_out_grad.dims()[0]; + int64_t dim1 = pre_out_grad.dims()[1]; + for (int64_t i = 0; i < dim0; ++i) { + T tmp = out_grad_data[i]; + blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); + } // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 4ecd8634ff41ff4eba6b5ed1d0fc78068190dce5..253b65a5f33308fc2c94537641b0fa19378b0cc9 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -124,8 +124,9 @@ REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OP_CPU_KERNEL( - huber_loss, - ops::HuberLossKernel); + huber_loss, ops::HuberLossKernel, + ops::HuberLossKernel); REGISTER_OP_CPU_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel, + ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 9f3a81f22cc52bef719f472e43f91bc81dfe2af6..f67f57827bc03e134bf87edd5bf033adb5098916 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -181,6 +181,9 @@ class Blas { const framework::Tensor& mat_b, const MatDescriptor& dim_b, T alpha, framework::Tensor* mat_out, T beta) const; + template + void VINV(int n, const T* a, T* y) const; + private: const DeviceContext& context_; }; @@ -282,6 +285,11 @@ class BlasT : private Blas { Base()->template BatchedGEMM(args...); } + template + void VINV(ARGS... args) const { + Base()->template VINV(args...); + } + private: const Blas* Base() const { return static_cast*>(this); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index c84087bb1e4849b27d53e05f046c93f631150f6f..972366bc093f4b7f0a090cf31213f75ccd89fd82 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -118,6 +118,11 @@ struct CBlas { static void VPOW(ARGS... args) { platform::dynload::vsPowx(args...); } + + template + static void VINV(ARGS... args) { + platform::dynload::vsInv(args...); + } }; template <> @@ -213,6 +218,11 @@ struct CBlas { static void VPOW(ARGS... args) { platform::dynload::vdPowx(args...); } + + template + static void VINV(ARGS... args) { + platform::dynload::vdInv(args...); + } }; #else @@ -603,6 +613,17 @@ void Blas::MatMul(const framework::Tensor &mat_a, dim_a.stride_, dim_b.stride_); } } +template +template +void Blas::VINV(int n, const T *a, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VINV(n, a, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = 1.0 / a[i]; + } +#endif +} } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index e1e4d168db3ca594b44396a6e30c5bfc03483eaf..57726956cfba802183903b436c82b15c34d8fcc9 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -18,9 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" -#ifdef __AVX__ -#include -#endif #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index 2b3d38d95a18fad9b76e616cdf2cb6c3eb07da3a..24df1f93edd85145d703ed3277b0d1ca06e67009 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -15,14 +15,10 @@ limitations under the License. */ #pragma once #include #include - +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc index 5641f914523771f47bd7f814bfd39964a53deefc..022ffc533779363b08404b8715ac37194a4be392 100644 --- a/paddle/fluid/operators/math/detail/avx_functions.cc +++ b/paddle/fluid/operators/math/detail/avx_functions.cc @@ -14,10 +14,8 @@ limitations under the License. */ #ifdef __AVX__ -#include #include "paddle/fluid/operators/math/detail/activation_functions.h" -// TODO(qingqing) refine this dependence -#include "paddle/legacy/cuda/src/avx_mathfun.h" +#include "paddle/fluid/operators/math/detail/avx_mathfun.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/detail/avx_mathfun.h b/paddle/fluid/operators/math/detail/avx_mathfun.h new file mode 100644 index 0000000000000000000000000000000000000000..d7cf91134e4553dfcd935a31993e06dfa74650ac --- /dev/null +++ b/paddle/fluid/operators/math/detail/avx_mathfun.h @@ -0,0 +1,731 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + AVX implementation of sin, cos, sincos, exp and log + + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + + Copyright (C) 2012 Giovanni Garberoglio + Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include "paddle/fluid/platform/cpu_info.h" + +/* __m128 is ugly to write */ +typedef __m256 v8sf; // vector of 8 float (avx) +typedef __m256i v8si; // vector of 8 int (avx) +typedef __m128i v4si; // vector of 8 int (avx) + +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \ + Val, Val} + +_PI32AVX_CONST(1, 1); +_PI32AVX_CONST(inv1, ~1); +_PI32AVX_CONST(2, 2); +_PI32AVX_CONST(4, 4); + +/* declare some AVX constants -- why can't I figure a better way to do that? */ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} + +_PS256_CONST(1, 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + +#ifndef __AVX2__ + +typedef union imm_xmm_union { + v8si imm; + v4si xmm[2]; +} imm_xmm_union; + +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ + { \ + imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ + } + +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ + { \ + imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ + u.xmm[0] = xmm0_; \ + u.xmm[1] = xmm1_; \ + imm_ = u.imm; \ + } + +#define AVX2_BITOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, int a) { \ + /* use SSE2 instruction to perform the bitop AVX2 */ \ + v4si x1, x2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1, a); \ + x2 = _mm_##fn(x2, a); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } + +//#warning "Using SSE2 to perform AVX2 bitshift ops" +AVX2_BITOP_USING_SSE2(slli_epi32) +AVX2_BITOP_USING_SSE2(srli_epi32) + +#define AVX2_INTOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \ + /* use SSE2 instructions to perform the AVX2 integer operation */ \ + v4si x1, x2; \ + v4si y1, y2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1, y1); \ + x2 = _mm_##fn(x2, y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } + +//#warning "Using SSE2 to perform AVX2 integer ops" +AVX2_INTOP_USING_SSE2(and_si128) +AVX2_INTOP_USING_SSE2(andnot_si128) +AVX2_INTOP_USING_SSE2(cmpeq_epi32) +AVX2_INTOP_USING_SSE2(sub_epi32) +AVX2_INTOP_USING_SSE2(add_epi32) +#define avx2_mm256_and_si256 avx2_mm256_and_si128 +#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128 +#else +#define avx2_mm256_slli_epi32 _mm256_slli_epi32 +#define avx2_mm256_srli_epi32 _mm256_srli_epi32 +#define avx2_mm256_and_si256 _mm256_and_si256 +#define avx2_mm256_andnot_si256 _mm256_andnot_si256 +#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32 +#define avx2_mm256_sub_epi32 _mm256_sub_epi32 +#define avx2_mm256_add_epi32 _mm256_add_epi32 +#endif /* __AVX2__ */ + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf *)_ps256_1; + + // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps( + x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); + v8sf e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + v8sf z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_log_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); + + tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + y = _mm256_sub_ps(y, tmp); + + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); + x = _mm256_add_ps(x, y); + x = _mm256_add_ps(x, tmp); + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +v8sf exp256_ps(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf *)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); + + /* how to perform a floorf with SSE: just below */ + // imm0 = _mm256_cvttps_epi32(fx); + // tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, substract 1 */ + // v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); + x = _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + +/* evaluation of 8 sines at onces using AVX intrisics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + +*/ +v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + v8si imm0, imm2; + +#ifndef __AVX2__ + v4si imm0_1, imm0_2; + v4si imm2_1, imm2_2; +#endif + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + +/* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives +*/ + +#ifdef __AVX2__ + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4= 256 diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index e2b4761435594fdc952ff5dba5b5fa4f4aa98e6c..6d22bf675724166d0701e9a51d0d23ae00ef1048 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -47,7 +47,6 @@ extern const float exp_float_consts[]; extern const int exp_int_0x7f[]; extern int g_tmp_mem[]; -#define ALIGN32 __attribute__((aligned(32))) #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f #define CEPHES_LOG2EF 1.44269504088896341 diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index eeb305a88bee8f0e21b205684d24b19ca4631f65..ac2d29f1c18392ebf917cc097e63670e06b1eded 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -16,9 +16,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" -#ifdef __AVX__ -#include -#endif namespace paddle { namespace operators { @@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { /* AVX instructions.*/ \ __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); \ __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); \ - __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); \ - __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); \ + __m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0); \ + __m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1); \ lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); \ hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); \ lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); \ diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index cb49e66488bd69d92430cbf6de1d08348ffe0202..e21092037a27d26cd31205b1b5d8e2f0cb8380cd 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -13,9 +13,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" -#ifdef __AVX__ -#include -#endif namespace paddle { namespace operators { @@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel { if (rest_ != 0) { \ j = offset + this->num_ - block; \ tmp = _mm256_loadu_ps((const float*)x + j); \ - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \ + tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \ sum = _mm256_add_ps(sum, tmp); \ } \ hi = _mm256_extractf128_ps(sum, 1); \ @@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel { j = offset + this->num_ - block; \ tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \ tmp = _mm256_mul_ps(tmp, tmp); \ - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \ + tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \ sum = _mm256_add_ps(sum, tmp); \ } \ hi = _mm256_extractf128_ps(sum, 1); \ diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 5a6e64b6f87d33249f0153e5f391deaf78e53de5..d55e832cc2d9a4a5e2cb7fe5cf451a1205601951 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -14,218 +14,380 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include +#include + namespace paddle { namespace operators { namespace math { template -void MatrixBitCodeFunctor::Add(const framework::Tensor& vec, - framework::Tensor* tmat) { - size_t batch_size = tmat->dims()[0]; - size_t width = tmat->dims()[1]; - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - tmat->data()[i * width + j] += vec.data()[index]; +struct MatrixBitCodeFunctorAdd : public boost::static_visitor { + const framework::Tensor &vec_; + framework::Tensor *tmat_; + + MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat) + : vec_(vec), tmat_(tmat) {} + + template + void operator()(const CodeTable &code_table) { + size_t batch_size = tmat_->dims()[0]; + size_t width = tmat_->dims()[1]; + auto *tmat_data = tmat_->data(); + auto *vec_data = vec_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + tmat_data[i * width + j] += vec_data[index]; + } } } +}; + +template +void MatrixBitCodeFunctor::Add(const framework::Tensor &vec, + framework::Tensor *tmat) { + MatrixBitCodeFunctorAdd func(vec, tmat); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::Tensor* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - vec->data()[index] += tmat.data()[i * width + j]; +struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *vec_; + MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat, + framework::Tensor *vec) + : tmat_(tmat), vec_(vec) {} + + template + void operator()(const CodeTable &table) { + size_t batch_size = tmat_.dims()[0]; + size_t width = tmat_.dims()[1]; + auto *vec_data = vec_->data(); + auto *tmat_data = tmat_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + vec_data[index] += tmat_data[i * width + j]; + } } } +}; + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, + framework::Tensor *vec) { + MatrixBitCodeFunctorAddGrad func(tmat, vec); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::SelectedRows* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec->mutable_value()->data()[row_index] += - tmat.data()[i * width + j]; +struct MatrixBitCodeFunctorSelectedRowsAddGrad + : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::SelectedRows *vec_; + + MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, + framework::SelectedRows *vec) + : tmat_(tmat), vec_(vec) {} + + template + void operator()(const CodeTable &code_table) { + size_t batch_size = tmat_.dims()[0]; + size_t width = tmat_.dims()[1]; + auto *vec_data = vec_->mutable_value()->template data(); + auto *tmat_data = tmat_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + int64_t row_index = vec_->GetIndexFromId(static_cast(index)); + vec_data[row_index] += tmat_data[i * width + j]; + } } } +}; + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, + framework::SelectedRows *vec) { + MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, - framework::Tensor* sum, T scale_sum) { - size_t num_samples = tmat.dims()[0]; - size_t o_width = tmat.dims()[1]; - for (size_t i = 0; i < num_samples; ++i) { - T sm = static_cast(0.0); - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - if (code->calc_bit(j)) { - // calc_bit starts from right most bit, while data in tmat[i] is in the - // reverse order. - sm += tmat.data()[i * o_width + j]; +struct MatrixBitCodeFunctorSum : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *sum_; + T scale_sum_; + + MatrixBitCodeFunctorSum(const framework::Tensor &tmat, framework::Tensor *sum, + T scale_sum) + : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {} + + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_.dims()[0]; + size_t o_width = tmat_.dims()[1]; + auto *tmat_data = tmat_.data(); + auto *sum_data = sum_->data(); + for (size_t i = 0; i < num_samples; ++i) { + T sm = static_cast(0.0); + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + // calc_bit starts from right most bit, while data in tmat[i] is in + // the + // reverse order. + sm += tmat_data[i * o_width + j]; + } } + sum_data[i] = scale_sum_ * sm; } - sum->data()[i] = scale_sum * sm; } +}; + +template +void MatrixBitCodeFunctor::Sum(const framework::Tensor &tmat, + framework::Tensor *sum, T scale_sum) { + MatrixBitCodeFunctorSum func(tmat, sum, scale_sum); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, - const framework::Tensor& weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat->dims()[0]; - size_t tmat_width = tmat->dims()[1]; - size_t input_width = input.dims()[1]; - size_t weight_width = weight.dims()[1]; - auto tmat_value = tmat->data(); - auto weight_value = weight.data(); - auto input_value = input.data(); - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_row = input_value + input_width * i; - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - const T* weight_row = weight_value + weight_width * index; - T sum = static_cast(0.0); - sum = blas.DOT(input_width, weight_row, input_row); - tmat_value[i * tmat_width + j] += sum; +struct MatrixBitCodeFunctorMul : public boost::static_visitor { + framework::Tensor *tmat_; + const framework::Tensor &weight_; + const framework::Tensor &input_; + + MatrixBitCodeFunctorMul(framework::Tensor *tmat, + const framework::Tensor &weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_->dims()[0]; + size_t tmat_width = tmat_->dims()[1]; + size_t input_width = input_.dims()[1]; + size_t weight_width = weight_.dims()[1]; + auto tmat_value = tmat_->data(); + auto weight_value = weight_.data(); + auto input_value = input_.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_row = input_value + input_width * i; + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + const T *weight_row = weight_value + weight_width * index; + T sum = blas.DOT(input_width, weight_row, input_row); + tmat_value[i * tmat_width + j] += sum; + } } } +}; + +template +void MatrixBitCodeFunctor::Mul(framework::Tensor *tmat, + const framework::Tensor &weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMul func(tmat, weight, input); + code_table_.apply_visitor(func); } +template +class ReservedVector : public std::vector { + public: + ReservedVector() { this->reserve(N); } +}; + template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::Tensor* weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat.dims()[0]; - size_t input_width = input.dims()[1]; - size_t tmat_width = tmat.dims()[1]; - size_t weight_width = weight->dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight->data(); - auto input_value = input.data(); - - std::unordered_map>> ops; - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_value_row = input_value + input_width * i; - const T* tmat_row = tmat_value + i * tmat_width; - for (int j = 0; j < code_length; ++j) { - ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); +struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *weight_; + const framework::Tensor &input_; + MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat, + framework::Tensor *weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_.dims()[0]; + size_t input_width = input_.dims()[1]; + size_t tmat_width = tmat_.dims()[1]; + size_t weight_width = weight_->dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_->data(); + auto input_value = input_.data(); + + std::map, 8u>> ops; + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_value_row = input_value + input_width * i; + const T *tmat_row = tmat_value + i * tmat_width; + for (int j = 0; j < code_length; ++j) { + ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } } - } - for (auto& op : ops) { - auto& op_in_row = op.second; - for (auto& pair : op_in_row) { - auto& scale = pair.first; - auto* input_row = pair.second; - T* weight_row = weight_value + op.first * weight_width; - blas.AXPY(input_width, scale, input_row, weight_row); + for (auto &op : ops) { + auto &op_in_row = op.second; + for (auto &pair : op_in_row) { + auto &scale = pair.first; + auto *input_row = pair.second; + T *weight_row = weight_value + op.first * weight_width; + blas.AXPY(input_width, scale, input_row, weight_row); + } } } +}; + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, + framework::Tensor *weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMulGradWeight func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::SelectedRows* weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat.dims()[0]; - size_t input_width = input.dims()[1]; - size_t tmat_width = tmat.dims()[1]; - size_t weight_width = weight->value().dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight->mutable_value()->data(); - auto input_value = input.data(); - - std::unordered_map>> ops; - ops.reserve(weight->rows().size()); - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_value_row = input_value + input_width * i; - const T* tmat_row = tmat_value + i * tmat_width; - for (int j = 0; j < code_length; ++j) { - ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); +struct MatrixBitCodeFunctorMulGradWeightSR + : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::SelectedRows *weight_; + const framework::Tensor &input_; + + MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat, + framework::SelectedRows *weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_.dims()[0]; + size_t input_width = input_.dims()[1]; + size_t tmat_width = tmat_.dims()[1]; + size_t weight_width = weight_->value().dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_->mutable_value()->data(); + auto input_value = input_.data(); + + std::unordered_map>> ops; + ops.reserve(weight_->rows().size()); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_value_row = input_value + input_width * i; + const T *tmat_row = tmat_value + i * tmat_width; + for (int j = 0; j < code_length; ++j) { + ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } } - } - for (auto& row : weight->rows()) { - auto& op_in_row = ops[row]; - for (auto& pair : op_in_row) { - auto& scale = pair.first; - auto* input_row = pair.second; - blas.AXPY(input_width, scale, input_row, weight_value); + for (auto &row : weight_->rows()) { + auto &op_in_row = ops[row]; + for (auto &pair : op_in_row) { + auto &scale = pair.first; + auto *input_row = pair.second; + blas.AXPY(input_width, scale, input_row, weight_value); + } + weight_value += weight_width; } - weight_value += weight_width; } +}; + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, + framework::SelectedRows *weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMulGradWeightSR func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, - const framework::Tensor& weight, - framework::Tensor* input) { - size_t num_samples = tmat.dims()[0]; - size_t tmat_width = tmat.dims()[1]; - size_t input_width = input->dims()[1]; - size_t weight_width = weight.dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight.data(); - auto input_value = input->data(); - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - - for (size_t k = 0; k < input_width; ++k) { - input_value[input_width * i + k] += - tmat_value[i * tmat_width + j] * - weight_value[weight_width * index + k]; +struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor { + const framework::Tensor &tmat_; + const framework::Tensor &weight_; + framework::Tensor *input_; + + MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat, + const framework::Tensor &weight, + framework::Tensor *input) + : tmat_(tmat), weight_(weight), input_(input) {} + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_.dims()[0]; + size_t tmat_width = tmat_.dims()[1]; + size_t input_width = input_->dims()[1]; + size_t weight_width = weight_.dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_.data(); + auto input_value = input_->data(); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + input_value[input_width * i + k] += + tmat_value[i * tmat_width + j] * + weight_value[weight_width * index + k]; + } } } } +}; + +template +void MatrixBitCodeFunctor::MulGradError(const framework::Tensor &tmat, + const framework::Tensor &weight, + framework::Tensor *input) { + MatrixBitCodeFunctorMulGradError func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { - size_t num_samples = tmat->dims()[0]; - size_t o_width = tmat->dims()[1]; - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - if (code->calc_bit(j)) { - tmat->data()[i * o_width + j] -= 1; +struct MatrixBitCodeFunctorSub : public boost::static_visitor { + framework::Tensor *tmat_; + + explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {} + + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_->dims()[0]; + size_t o_width = tmat_->dims()[1]; + auto *tmat_data = tmat_->data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + tmat_data[i * o_width + j] -= 1; + } } } } +}; + +template +void MatrixBitCodeFunctor::Sub(framework::Tensor *tmat) { + MatrixBitCodeFunctorSub func(tmat); + code_table_.apply_visitor(func); } template class MatrixBitCodeFunctor; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 35ca73802b48982ddf3ed7485b56f50221c9f28c..01e4889d34ad6e409f1b8a9c4bf783800187e863 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include @@ -22,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/variant.h" #if defined(_WIN32) #include @@ -98,24 +100,7 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 -// set a code interface to create multiple code -class Code { - public: - virtual ~Code() {} - virtual size_t calc_index(int bit) const = 0; - virtual bool calc_bit(int bit) const = 0; - virtual int get_length() const = 0; -}; -// set a CodeTable interface to create multiple code table -class CodeTable { - public: - virtual std::unique_ptr get_code(int64_t code) const = 0; - virtual size_t size() const = 0; - virtual int get_max_code_length() const = 0; - virtual ~CodeTable() {} -}; - -class SimpleCode : public Code { +class SimpleCode { public: SimpleCode(size_t code, size_t num_classes, const int64_t* ids) : c_(static_cast(ids[code]) + num_classes) {} @@ -137,16 +122,16 @@ class SimpleCode : public Code { }; template -class CustomCode : public Code { +class CustomCode { public: CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) - : ids_(ids), index_(index) { - ptable_ = ptable.Slice(index, index + 1); - pcode_ = pcode.Slice(index, index + 1); + const int64_t* ids, int index) { + seq_len_ = ptable.dims()[1]; + ptable_data_ = ptable.data() + seq_len_ * index; + pcode_data_ = pcode.data() + seq_len_ * index; } /** - * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * Here the id of root should be 1 rather than 0, thus the encoding of class c * is `c + num_classes` and all siblings can get the same weight indice using * prefixes. * Weight index is the prefixes of encoding, thus leave out the right most @@ -154,36 +139,37 @@ class CustomCode : public Code { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_.data()[bit]; } - bool calc_bit(int bit) const { return pcode_.data()[bit]; } - int get_length() const { - int length = 0; + size_t calc_index(int bit) const { return ptable_data_[bit]; } + bool calc_bit(int bit) const { return pcode_data_[bit]; } - for (int i = 0; i < static_cast(ptable_.dims()[1]); i++) { - if (ptable_.data()[i] >= 0) { - length++; - } else { - return length; - } + // NOTE: this function is not thread-safe. + int get_length() const { + if (length_ < 0) { + auto len = seq_len_; + length_ = + static_cast(std::find_if(ptable_data_, ptable_data_ + len, + [](const T& val) { return val < 0; }) - + ptable_data_); } - return length; + return length_; } private: - framework::Tensor ptable_; - framework::Tensor pcode_; - const int64_t* ids_; - const int index_; + int64_t seq_len_; + const T* ptable_data_; + const T* pcode_data_; + mutable int length_{-1}; }; -class SimpleCodeTable : public CodeTable { +class SimpleCodeTable { public: SimpleCodeTable(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids) {} - std::unique_ptr get_code(int64_t code) const { - std::unique_ptr coder(new SimpleCode(code, num_classes_, ids_)); - return coder; + + SimpleCode get_code(int64_t code) const { + return SimpleCode(code, num_classes_, ids_); } + size_t size() const { return num_classes_; } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } @@ -193,15 +179,14 @@ class SimpleCodeTable : public CodeTable { }; template -class CustomCodeTable : public CodeTable { +class CustomCodeTable { public: CustomCodeTable(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids) : ptable_(ptable), pcode_(pcode), ids_(ids) {} - std::unique_ptr get_code(int64_t code) const { - std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); - return coder; + CustomCode get_code(int64_t code) const { + return CustomCode(ptable_, pcode_, ids_, code); } size_t size() const { return static_cast(ptable_.dims()[1]); } @@ -215,19 +200,21 @@ class CustomCodeTable : public CodeTable { const int64_t* ids_; }; +using CodeTable = boost::variant>; + template class MatrixBitCodeFunctor { public: MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids), - code_table_(new SimpleCodeTable(num_classes, ids)) {} + code_table_(SimpleCodeTable(num_classes, ids)) {} MatrixBitCodeFunctor(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids) : num_classes_(static_cast(ptable.dims()[1])), ids_(ids), - code_table_(new CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(ptable, pcode, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -277,7 +264,7 @@ class MatrixBitCodeFunctor { size_t num_classes_; const int64_t* ids_; - std::unique_ptr code_table_; + CodeTable code_table_; }; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt index eee0b90fbae216e804e62993313796e914fcef5a..6b551d13f1dc5cd1c82a15a8347b278e8f795c1c 100644 --- a/paddle/fluid/operators/tensorrt/CMakeLists.txt +++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt @@ -1,5 +1,5 @@ op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) -file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") +file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n") nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis) diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 3cf2ce3c7ef87dcf75548f7d9c3a55d06ed765e8..b993c55fad13e892efd51648b78704bec83bf2b4 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -21,8 +21,6 @@ namespace paddle { -DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); - namespace operators { class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { @@ -31,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); - AddAttr("engine_uniq_key", "unique key for the TRT engine."); AddAttr("max_batch_size", "the maximum batch size."); AddAttr("workspace_size", "the workspace size."); AddComment("TensorRT engine operator."); @@ -50,6 +47,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc deleted file mode 100644 index cbe1b426f65386e722a7b02ec1fdfdf75bfd770c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - tensorrt_engine, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 5b2aad55a4e4b640c614ad7639c83b49cef3dc07..88c4f508474e66953b79fb92ff1eb0b53a539f07 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -27,8 +27,6 @@ namespace paddle { -DECLARE_int32(tensorrt_engine_batch_size); - namespace operators { using FluidDT = framework::proto::VarType_Type; @@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kINT32; } -nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { +nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { PADDLE_ENFORCE_GT(shape.size(), 1UL, "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, @@ -63,128 +61,119 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { } // namespace // NOLINT using inference::Singleton; -using inference::tensorrt::TRT_EngineManager; +using inference::tensorrt::TensorRTEngine; + +class TensorRTEngineOp : public framework::OperatorBase { + private: + std::vector input_names_; + std::unordered_set param_names_; + mutable std::unique_ptr trt_engine_; + int max_batch_size_; + int workspace_size_; -class TensorRTEngineOp : public framework::OperatorWithKernel { public: - using framework::OperatorWithKernel::OperatorWithKernel; + TensorRTEngineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) { + input_names_ = Inputs("Xs"); + max_batch_size_ = Attr("max_batch_size"); + workspace_size_ = Attr("workspace_size"); + + auto params = Attr>("parameters"); + for (const auto ¶m : params) { + param_names_.insert(param); + } + } protected: - void InferShape(framework::InferShapeContext* ctx) const override {} - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input0 = ctx.Inputs("Xs").front(); - framework::OpKernelType kt = framework::OpKernelType( - ctx.scope().FindVar(input0)->GetMutable()->type(), - ctx.GetPlace()); - return kt; + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunTrt(scope, dev_place); } -}; -template -class TensorRTEngineKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto engine_name = context.Attr("engine_uniq_key"); - int max_batch_size = context.Attr("max_batch_size"); - if (!Singleton::Global().HasEngine(engine_name)) { - Prepare(context); + void RunTrt(const framework::Scope &scope, + const platform::Place &dev_place) const { + int runtime_batch = 1; + if (trt_engine_.get() == nullptr) { + trt_engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, nullptr, + boost::get(dev_place).device)); + Prepare(scope, dev_place, trt_engine_.get()); } - auto* engine = Singleton::Global().Get(engine_name); - auto input_names = context.op().Inputs("Xs"); - PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size); + + auto *engine = trt_engine_.get(); + PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = - context.Attr>("output_name_mapping"); + Attr>("output_name_mapping"); - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto& param : params) { - parameters.insert(param); - } // Convert input tensor from fluid to engine. - for (const auto& x : context.Inputs("Xs")) { - if (parameters.count(x)) continue; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; // convert input and copy to TRT engine's buffer - auto& t = inference::analysis::GetFromScope( - context.scope(), x); + auto &t = + inference::analysis::GetFromScope(scope, x); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; if (platform::is_cpu_place(t.place())) { - engine->SetInputFromCPU(x, static_cast(t.data()), + engine->SetInputFromCPU(x, static_cast(t.data()), t.memory_size()); } else { - engine->SetInputFromGPU(x, static_cast(t.data()), + engine->SetInputFromGPU(x, static_cast(t.data()), t.memory_size()); } } + + PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); // Execute the engine. - PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); - engine->Execute(FLAGS_tensorrt_engine_batch_size); + engine->Execute(runtime_batch); // Convert output tensor from engine to fluid int output_index = 0; VLOG(4) << "TensorRT Engine Op Outputs:"; - for (const auto& y : context.Outputs("Ys")) { + for (const auto &y : Outputs("Ys")) { VLOG(4) << y; // convert output and copy to fluid. - nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); + nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. // The ITensor doesn't contain the batch size dim. std::vector ddim; - ddim.push_back(FLAGS_tensorrt_engine_batch_size); + ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; i++) { ddim.push_back(dims.d[i]); } - auto* fluid_v = context.scope().FindVar(y); + auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); - auto* fluid_t = fluid_v->GetMutable(); + auto *fluid_t = fluid_v->GetMutable(); fluid_t->Resize(framework::make_ddim(ddim)); - // TODO(Superjomn) find some way to determine which device to output the - // tensor. - // if (platform::is_cpu_place(fluid_t->place())) { // TODO(Superjomn) change this float to dtype size. - auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * - FLAGS_tensorrt_engine_batch_size; + auto size = + inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch; engine->GetOutputInGPU( output_maps[output_index], fluid_t->mutable_data(platform::CUDAPlace( - boost::get(context.GetPlace()).device)), + boost::get(dev_place).device)), size * sizeof(float)); - output_index += 1; } cudaStreamSynchronize(*engine->stream()); } - protected: - void Prepare(const framework::ExecutionContext& context) const { + void Prepare(const framework::Scope &scope, const platform::Place &dev_place, + TensorRTEngine *engine) const { VLOG(4) << "Prepare engine"; - // Get the ProgramDesc and pass to convert. framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch_size = context.Attr("max_batch_size"); - int workspace_size = context.Attr("workspace_size"); - - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto& param : params) { - parameters.insert(param); - } + block_desc.ParseFromString(Attr("subgraph")); std::vector output_maps = - context.Attr>("output_name_mapping"); - - // TODO(Superjomn) replace this with a different stream - auto* engine = Singleton::Global().Create( - max_batch_size, workspace_size, nullptr /*engine hold its own stream*/, - context.Attr("engine_uniq_key"), - boost::get(context.GetPlace()).device); + Attr>("output_name_mapping"); engine->InitNetwork(); @@ -192,39 +181,33 @@ class TensorRTEngineKernel : public framework::OpKernel { VLOG(4) << "parsed var size " << block.AllVars().size(); // Add inputs VLOG(4) << "declare inputs"; - for (auto& input : context.Inputs("Xs")) { - if (parameters.count(input)) continue; + for (auto &input : Inputs("Xs")) { + if (param_names_.count(input)) continue; VLOG(4) << "declare input " << input; - auto* var = block.FindVar(input); + + auto &t = + inference::analysis::GetFromScope(scope, input); + auto t_shape = framework::vectorize(t.dims()); + + auto *var = block.FindVar(input); // TensorRT engine need to create parameters. The parameter's description // should be set in PADDLE_ENFORCE(var, "no variable called %s", input); PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, "TensorRT engine only takes LoDTensor as input"); - auto shape = var->GetShape(); - // For the special batch_size placeholder -1, drop it and pass the real - // shape of data. - // TODO(Superjomn) fix this with batch broadcast, or it can't handle - // variational batch size. - if (shape[0] == -1) { - shape[0] = FLAGS_tensorrt_engine_batch_size; - } + engine->DeclareInput( input, FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(shape)); + Vec2TRT_Dims(t_shape)); } - inference::Singleton::Global() - .ConvertBlock(block_desc, parameters, context.scope(), engine); + .ConvertBlock(block_desc, param_names_, scope, engine); // Add outputs - for (auto& output : output_maps) { - if (!engine->HasDeclared(output)) { - engine->DeclareOutput(output); - } + for (auto &output : output_maps) { + engine->DeclareOutput(output); } - engine->FreezeNetwork(); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 56bdd6c2f2801967829f2baf889b5517a1d9d8d9..287b0edc96e5e312b0ff1725ee188ff319d44d23 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -24,8 +24,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_CUDA_ONLY_OP(tensorrt_engine); - +USE_NO_KERNEL_OP(tensorrt_engine); namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 23c7ebe84221986a5f7ac7583c3a8e17d04fe4af..2f205e1d5ca30d67a55e4df0f5e879ffef9a9c26 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -67,6 +67,13 @@ ENDIF() # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) +if(WIN32) + if(WITH_GPU AND NOT WITH_DSO) + get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) + target_link_libraries(device_context ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +endif(WIN32) + nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) cc_test(init_test SRCS init_test.cc DEPS device_context) diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 55dba545ff133b1c219ee58f6d1bb2d2130d1a59..c70e3be858fe72f298a5e553bcca189641392cdc 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -16,6 +16,26 @@ limitations under the License. */ #include +#ifdef _WIN32 +#if defined(__AVX2__) +#include //avx2 +#elif defined(__AVX__) +#include //avx +#endif // AVX +#else // WIN32 +#ifdef __AVX__ +#include +#endif +#endif // WIN32 + +#if defined(_WIN32) +#define ALIGN32_BEG __declspec(align(32)) +#define ALIGN32_END +#else +#define ALIGN32_BEG +#define ALIGN32_END __attribute__((aligned(32))) +#endif // _WIN32 + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index f0a973662360fd9ff35e1006cce937d86f3e563c..c3f9433503accf98d30ccaa57b9b4b8f3c68666a 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -82,6 +82,8 @@ extern void* mklml_dso_handle; __macro(vdSqr); \ __macro(vsPowx); \ __macro(vdPowx); \ + __macro(vsInv); \ + __macro(vdInv); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 466c77469ef256179c52442d21c1d62dfc4ef1bb..5a9e24374f6f777c2286b8928eae9dcbe8be6378 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -19,7 +19,7 @@ namespace paddle { namespace platform { #if CUDA_VERSION >= 10000 -static void CUDART_CB StreamCallbackFunc(void *user_data); +static void CUDART_CB StreamCallbackFunc(void *user_data) #else static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b8954cb12628d1f4f333956e0213ddf9c01e592c..c79d5d9403db613a8cdda59b9874a8b886458357 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -19,10 +19,6 @@ if(WITH_PYTHON) endif(WITH_AMD_GPU) if(WIN32) - if(WITH_GPU AND NOT WITH_DSO) - get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) - target_link_libraries(paddle_pybind ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) target_link_libraries(paddle_pybind shlwapi) endif(WIN32) diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 470e8b050808295d49728bbdb757b6a612df9a01..222c128c66f37a259eb17527fe2586860f701275 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -41,6 +41,23 @@ namespace pd = paddle::framework; namespace paddle { namespace pybind { using set_name_func = void (pd::DataFeedDesc::*)(const std::string&); +#ifdef PADDLE_WITH_PSLIB +void BindAsyncExecutor(py::module* m) { + py::class_(*m, "AsyncExecutor") + .def(py::init([](framework::Scope* scope, const platform::Place& place) { + return std::unique_ptr( + new framework::AsyncExecutor(scope, place)); + })) + .def("run_from_files", &framework::AsyncExecutor::RunFromFile) + .def("init_server", &framework::AsyncExecutor::InitServer) + .def("init_worker", &framework::AsyncExecutor::InitWorker) + .def("start_server", &framework::AsyncExecutor::StartServer) + .def("stop_server", &framework::AsyncExecutor::StopServer) + .def("gather_servers", &framework::AsyncExecutor::GatherServers) + .def("init_model", &framework::AsyncExecutor::InitModel) + .def("save_model", &framework::AsyncExecutor::SaveModel); +} // end BindAsyncExecutor +#else void BindAsyncExecutor(py::module* m) { py::class_(*m, "AsyncExecutor") .def(py::init([](framework::Scope* scope, const platform::Place& place) { @@ -49,5 +66,6 @@ void BindAsyncExecutor(py::module* m) { })) .def("run_from_files", &framework::AsyncExecutor::RunFromFile); } // end BindAsyncExecutor +#endif } // end namespace pybind } // end namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 017598e1707ffdde1396e7d2bb0bbcbdfa547e71..737ae2dd9c3451a0c9aabd31a5ec05b908356c98 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -960,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "memory_optimize", + [](const BuildStrategy &self) { return self.memory_optimize_; }, + [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "memory_early_delete", + [](const BuildStrategy &self) { return self.memory_early_delete_; }, + [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 48263a1f0580f697a8b4edbcf9fe60e0bb15eba9..2a4428eb705d9b19bf8d1f2970f54a503feef7b0 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -151,7 +151,7 @@ def __bootstrap__(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'selected_gpus' + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 2664a7301db3bf471126ff26504e7042f02b7d84..4ca6a5170eb57b0d799159b7ecc55c2389246041 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -24,6 +24,8 @@ from paddle.fluid.proto import data_feed_pb2 from google.protobuf import text_format from . import io from .data_feed_desc import DataFeedDesc +from .distributed import ps_instance +from .contrib.utils import hdfs_utils as hdfs __all__ = ['AsyncExecutor'] @@ -74,7 +76,7 @@ class AsyncExecutor(object): Note: Only running on CPUPlace supported. """ - def __init__(self, place=None): + def __init__(self, place=None, run_mode=""): if place is None: place = core.CPUPlace() if not isinstance(place, core.CPUPlace): @@ -85,8 +87,16 @@ class AsyncExecutor(object): scope = global_scope() self.executor = core.AsyncExecutor(scope, p) + self.instance = None - def run(self, program, data_feed, filelist, thread_num, fetch, debug=False): + def run(self, + program, + data_feed, + filelist, + thread_num, + fetch, + mode="", + debug=False): """ Run program by this AsyncExecutor. Training dataset will be in filelist. Users can also inspect certain variables by naming them in parameter @@ -106,6 +116,7 @@ class AsyncExecutor(object): thread_num(int): number of concurrent training threads. See :code:`Note` for how to set this properly fetch(str|list): the var name or a list of var names to inspect + mode(str): run mode of this interface debug(bool): When set to True, fetch vars will be printed to standard output after each minibatch @@ -148,4 +159,152 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, - fetch_var_names, debug) + fetch_var_names, mode, debug) + + def download_data(self, + afs_path, + local_path, + fs_default_name, + ugi, + file_cnt, + hadoop_home="$HADOOP_HOME", + process_num=12): + """ + download_data is a default download method for distributed training + a user download data without this method + + Example: + >>> exe = fluid.AsyncExecutor() + >>> exe.download_data("/xxx/xxx/xx/", + >>> "./data", "afs:// + >>> xxx.xxx.xxx.xxx:9901", "xxx,yyy") + Args: + afs_path(str): afs_path defined by users + local_path(str): download data path + fs_default_name(str): file system server address + ugi(str): hadoop ugi + file_cn(int): a user can specify file number for debugging + hadoop_home(str): hadoop home path + process_num(int): download process num + """ + if self.instance is None: + raise ValueError('instance is None, please run' + 'config_distributed_nodes init instance') + + configs = {"fs.default.name": fs_default_name, "hadoop.job.ugi": ugi} + + client = hdfs.HDFSClient(hadoop_home, configs) + downloads = hdfs.multi_download( + client, + afs_path, + local_path, + self.instance.get_worker_index(), + self.instance.get_node_cnt() / 2, + file_cnt, + multi_processes=process_num) + self.instance.barrier_worker() #wait for download_data + + def get_instance(self): + """ + get current node's instance so that user can do operations + in distributed setting + """ + if self.instance is None: + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + return self.instance + + def config_distributed_nodes(self): + """ + if a user needs to run distributed async executor + he or she needs to do a global configuration so that + information of current process can be obtained + """ + self.instance = ps_instance.PaddlePSInstance(1, 2) + return self.instance + + def stop(self): + """ + at the end of process, users should call stop to servers + and barrier all workers + """ + if self.instance is None: + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + self.instance.barrier_worker() #worker do all things + if self.instance.is_first_worker(): + self.executor.stop_server() + self.instance.barrier_worker() #sync + self.instance.barrier_all() + self.instance.finalize() + + def init_server(self, dist_desc): + """ + initialize server of current node if current process is a server + Args: + dist_desc(str): a protobuf string that describes + how to init a worker and a server + """ + if self.instance is None: + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + self.executor.init_server(dist_desc, self.instance._rankid) + ip = self.executor.start_server() + self.instance.set_ip(ip) + self.instance.barrier_all() #wait all server start + ips = self.instance.gather_ips() + self.executor.gather_servers(ips, self.instance.get_node_cnt()) + self.instance.barrier_all() #wait all worker start + + def init_worker(self, dist_desc, startup_program): + """ + initialize worker of current node if current process is a worker + Args: + dist_desc(str): a protobuf string that describes + how to init a worker and a server + startup_program(fluid.Program): startup program of current process + """ + if self.instance is None: + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + place = core.CPUPlace() + executor = Executor(place) + executor.run(startup_program) + + self.instance.barrier_all() #wait all server start + ips = self.instance.gather_ips() + self.executor.init_worker(dist_desc, ips, + self.instance.get_node_cnt(), + self.instance._rankid) + self.instance.barrier_all() #wait all worker start + if self.instance.is_first_worker(): + self.executor.init_model() + self.instance.barrier_worker() #wait init model + + def init_model(self): + """ + init_model command that can be invoked from one of the worker + model parameters are initialized in servers + """ + if self.instance is None: + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + self.executor.init_model() + + def save_model(self, save_path): + """ + save_model command that can be invoked from one of the worker + model parameters are saved in servers and upload to save_path of file system + Args: + save_path(str): save path to file system + """ + if self.instance is None: + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + self.executor.save_model(save_path) diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py index 6e479bdc2b93c1189ba07a6f20b2408c34110b93..20b2cc381aaa1b837ce106410246bc8cedb2fc88 100644 --- a/python/paddle/fluid/contrib/utils/__init__.py +++ b/python/paddle/fluid/contrib/utils/__init__.py @@ -13,10 +13,10 @@ # limitations under the License. from __future__ import print_function -from . import lookup_table_utils -from .lookup_table_utils import * +#from . import lookup_table_utils +#from .lookup_table_utils import * from . import hdfs_utils from .hdfs_utils import * -__all__ = lookup_table_utils.__all__ +#__all__ = lookup_table_utils.__all__ __all__ = hdfs_utils.__all__ diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 251665d85e166f4ebf66eced7a5889ee9fc23e08..baea57ccce0e9ca3a8fab244e43a107a89cfe67d 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -32,6 +32,28 @@ _logger.setLevel(logging.INFO) class HDFSClient(object): + """ + A tool of HDFS + + Args: + hadoop_home (string): hadoop_home + configs (dict): hadoop config, it is a dict, please contain \ + key "fs.default.name" and "hadoop.job.ugi" + Can be a float value + Examples: + hadoop_home = "/home/client/hadoop-client/hadoop/" + + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + + client.ls("/user/com/train-25") + files = client.lsr("/user/com/train-25/models") + """ + def __init__(self, hadoop_home, configs): self.pre_commands = [] hadoop_bin = '%s/bin/hadoop' % hadoop_home @@ -52,9 +74,13 @@ class HDFSClient(object): ret_code = 0 ret_out = None ret_err = None + whole_commands = " ".join(whole_commands) for x in range(retry_times + 1): proc = subprocess.Popen( - whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + whole_commands, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) (output, errors) = proc.communicate() ret_code, ret_out, ret_err = proc.returncode, output, errors if ret_code: @@ -68,10 +94,12 @@ class HDFSClient(object): def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): """ upload the local file to hdfs - args: - local_file_path: the local file path - remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) - return: + Args: + hdfs_path: hdfs path, target path + local_path: local file path, source path + overwrite: will overwrite the original file + retry_times: max times retry to upload + Returns: True or False """ assert hdfs_path is not None @@ -114,10 +142,12 @@ class HDFSClient(object): def download(self, hdfs_path, local_path, overwrite=False, unzip=False): """ download from hdfs - args: - local_file_path: the local file path - remote_file_path: remote dir on hdfs - return: + Args: + hdfs_path: hdfs path, target path + local_path: local file path, source path + overwrite: will remove original file and overwrite it. + unzip: ignore this param + Returns True or False """ _logger.info('Downloading %r to %r.', hdfs_path, local_path) @@ -159,11 +189,11 @@ class HDFSClient(object): def is_exist(self, hdfs_path=None): """ whether the remote hdfs path exists? - args: - remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) + Args: + hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) fs_name: The default values are the same as in the job configuration fs_ugi: The default values are the same as in the job configuration - return: + Returns: True or False """ exist_cmd = ['-test', '-e', hdfs_path] @@ -182,11 +212,11 @@ class HDFSClient(object): def is_dir(self, hdfs_path=None): """ whether the remote hdfs path exists? - args: + Args: remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) fs_name: The default values are the same as in the job configuration fs_ugi: The default values are the same as in the job configuration - return: + Returns: True or False """ @@ -206,15 +236,17 @@ class HDFSClient(object): return True def delete(self, hdfs_path): - """Remove a file or directory from HDFS. - - :param hdfs_path: HDFS path. - :param recursive: Recursively delete files and directories. By default, - this method will raise an :class:`HdfsError` if trying to delete a - non-empty directory. + """ + Remove a file or directory from HDFS. - This function returns `True` if the deletion was successful and `False` if - no file or directory previously existed at `hdfs_path`. + Args: + param hdfs_path: HDFS path. + param recursive: Recursively delete files and directories. By default, + this method will raise an :class:`HdfsError` if trying to delete a + non-empty directory. + Returns: + This function returns `True` if the deletion was successful and `False` if + no file or directory previously existed at `hdfs_path`. """ _logger.info('Deleting %r.', hdfs_path) @@ -240,14 +272,17 @@ class HDFSClient(object): return True def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False): - """Move a file or folder. - - :param hdfs_src_path: Source path. - :param hdfs_dst_path: Destination path. If the path already exists and is - a directory, the source will be moved into it. If the path exists and is - a file, or if a parent destination directory is missing, this method will - raise an :class:`HdfsError`. - + """ + Rename a file or folder. + Args: + :param hdfs_src_path: Source path. + :param hdfs_dst_path: Destination path. If the path already exists and is + a directory, the source will be moved into it. If the path exists and is + a file, or if a parent destination directory is missing, this method will + raise an :class:`HdfsError`. + Returns: + This function returns `True` if the rename was successful and `False` if + rename was faild. """ assert hdfs_src_path is not None assert hdfs_dst_path is not None @@ -273,6 +308,11 @@ class HDFSClient(object): @staticmethod def make_local_dirs(local_path): + """ + create a directiory local, is same to mkdir + Args: + local_path: local path that wants to create a directiory. + """ try: os.makedirs(local_path) except OSError as e: @@ -281,9 +321,11 @@ class HDFSClient(object): def makedirs(self, hdfs_path): """Create a remote directory, recursively if necessary. - - :param hdfs_path: Remote path. Intermediate directories will be created - appropriately. + Args: + :param hdfs_path: Remote path. Intermediate directories will be created + appropriately. + Returns: + True if make a directories was successful, False when make a directiries was failed. """ _logger.info('Creating directories to %r.', hdfs_path) assert hdfs_path is not None @@ -303,6 +345,13 @@ class HDFSClient(object): return True def ls(self, hdfs_path): + """ + ls a hdfs_path. + Args: + :param hdfs_path: hdfs_path will be ls. + Returns: + This function returns a `list` that contaion all files in the hdfs_path. + """ assert hdfs_path is not None if not self.is_exist(hdfs_path): @@ -328,6 +377,14 @@ class HDFSClient(object): return ret_lines def lsr(self, hdfs_path, only_file=True, sort=True): + """ + ls a hdfs_path sort by time. + Args: + :param hdfs_path: hdfs_path will be ls. + Returns: + This function returns a `list` that contaion all files sorted by time in the hdfs_path. + """ + def sort_by_time(v1, v2): v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M') v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M') @@ -371,12 +428,15 @@ def multi_upload(client, multi_processes=5, overwrite=False): """ - :param overwrite: will overwrite hdfs file or not - :param multi_processes: the upload data process at the same time, default=5 - :param client: instance of HDFSClient - :param hdfs_path: path on hdfs - :param local_path: path on local - :return: + Upload file to hdfs. + Args: + :param overwrite: will overwrite hdfs file or not + :param multi_processes: the upload data process at the same time, default=5 + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + Returns: + """ def __subprocess_upload(datas): @@ -386,6 +446,13 @@ def multi_upload(client, client.upload(hdfs_re_path, data, overwrite, retry_times=5) def get_local_files(path): + """ + Get all local files + Args: + path: local file path + Returns: + A list that contation all files in the path. + """ rlist = [] if not os.path.isdir(path): @@ -426,16 +493,21 @@ def multi_download(client, local_path, trainer_id, trainers, + file_cnt, multi_processes=5): """ multi_download - :param client: instance of HDFSClient - :param hdfs_path: path on hdfs - :param local_path: path on local - :param trainer_id: current trainer id - :param trainers: all trainers number - :param multi_processes: the download data process at the same time, default=5 - :return: None + Args: + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + :param trainer_id: current trainer id + :param trainers: all trainers number + :param file_cnt: all file number + :param multi_processes: the download data process at the same time, default=5 + :return: None + Returns: + A list that be downloaded. """ def __subprocess_download(datas): @@ -449,7 +521,7 @@ def multi_download(client, client.make_local_dirs(local_path) _logger.info("Make local dir {} successfully".format(local_path)) - all_need_download = client.lsr(hdfs_path, sort=True) + all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt] need_download = all_need_download[trainer_id::trainers] _logger.info("Get {} files From all {} files need to be download from {}". format(len(need_download), len(all_need_download), hdfs_path)) @@ -500,6 +572,7 @@ if __name__ == "__main__": "/home/xx/data1", 1, 5, + 100, multi_processes=5) multi_upload(client, "/user/com/train-25/model", "/home/xx/data1") diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py index 52d9ce75f8d73eb3c3e8683bc0793e9dd8fbe48d..74824f6832442d5090e0cea2962ca2f68b7a0181 100644 --- a/python/paddle/fluid/distribute_lookup_table.py +++ b/python/paddle/fluid/distribute_lookup_table.py @@ -15,12 +15,52 @@ LOOKUP_TABLE_TYPE = "lookup_table" +def find_distributed_lookup_table_inputs(program, table_name): + """ + Find input variable of distribute lookup table in program. + We only support one distribute table now. + Args: + program(Program): given program, locate distributed lookup table + table_name(str): given table name that is found beforehand + Returns: + inputs + """ + local_vars = program.current_block().vars + inputs = [] + for op in program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if table_name == op.input("W")[0]: + inputs.extend([local_vars[name] for name in op.input("Ids")]) + return inputs + + +def find_distributed_lookup_table_outputs(program, table_name): + """ + Find output variable of distribute lookup table in program. + We only support one distribute table now. + Args: + program(Program): given program, locate distributed lookup table + table_name(str): given table name that is found beforehand + Returns: + outputs + """ + local_vars = program.current_block().vars + outputs = [] + for op in program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if table_name == op.input("W")[0]: + outputs.extend([local_vars[name] for name in op.output("Out")]) + return outputs + + def find_distributed_lookup_table(program): """ Find distribute lookup table in program. We only support one distribute table now. - :param program: - :return: table_name or None + Args: + program(Program): given program, locate distributed lookup table + Returns: + table_name or None """ table_name = None diff --git a/python/paddle/fluid/distributed/__init__.py b/python/paddle/fluid/distributed/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cd609c504078b907221a689fbb4e910ec8d54270 --- /dev/null +++ b/python/paddle/fluid/distributed/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py new file mode 100644 index 0000000000000000000000000000000000000000..87dfab92c53d9950d4606e078cc9f51bcda8f4d3 --- /dev/null +++ b/python/paddle/fluid/distributed/downpour.py @@ -0,0 +1,105 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +from .node import DownpourServer +from .node import DownpourWorker +from ..backward import append_backward +import ps_pb2 as pslib +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs +from google.protobuf import text_format + + +class DownpourSGD(object): + """ + Distributed optimizer of downpour stochastic gradient descent + Standard implementation of Google's Downpour SGD + in Large Scale Distributed Deep Networks + + Args: + learning_rate (float): the learning rate used to update parameters. \ + Can be a float value + Examples: + .. code-block:: python + + downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2) + downpour_sgd.minimize(cost) + """ + + def __init__(self, learning_rate=0.001, window=1): + # todo(guru4elephant): add more optimizers here as argument + # todo(guru4elephant): make learning_rate as a variable + self.learning_rate_ = learning_rate + self.window_ = window + self.type = "downpour" + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + """ + DownpounSGD is a distributed optimizer so + that user can call minimize to generate backward + operators and optimization operators within minmize function + Args: + loss(Variable): loss variable defined by user + startup_program(Program): startup program that defined by user + parameter_list(str list): parameter names defined by users + no_grad_set(set): a set of variables that is defined by users + so that these variables do not need gradient computation + Returns: + [ps_param, worker_skipped_ops] + ps_param: parameter server protobuf desc + worker_skipped_ops: operator names that need + to be skipped during execution + """ + params_grads = sorted( + append_backward(loss, parameter_list, no_grad_set), + key=lambda x: x[0].name) + table_name = find_distributed_lookup_table(loss.block.program) + prefetch_slots = find_distributed_lookup_table_inputs( + loss.block.program, table_name) + prefetch_slots_emb = find_distributed_lookup_table_outputs( + loss.block.program, table_name) + server = DownpourServer() + # window is communication strategy + worker = DownpourWorker(self.window_) + # Todo(guru4elephant): support multiple tables definitions + # currently support one big sparse table + sparse_table_index = 0 + # currently merge all dense parameters into one dense table + dense_table_index = 1 + params = [] + grads = [] + for i in params_grads: + params.append(i[0]) + for i in params_grads: + grads.append(i[1]) + server.add_sparse_table(sparse_table_index, self.learning_rate_, + prefetch_slots, prefetch_slots_emb) + server.add_dense_table(dense_table_index, self.learning_rate_, params, + grads) + worker.add_sparse_table(sparse_table_index, self.learning_rate_, + prefetch_slots, prefetch_slots_emb) + worker.add_dense_table(dense_table_index, self.learning_rate_, params, + grads) + ps_param = pslib.PSParameter() + ps_param.server_param.CopyFrom(server.get_desc()) + ps_param.trainer_param.CopyFrom(worker.get_desc()) + # Todo(guru4elephant): figure out how to support more sparse parameters + # currently only support lookup_table + worker_skipped_ops = ["lookup_table", "lookup_table_grad"] + ps_param.trainer_param.skip_op.extend(worker_skipped_ops) + return [ps_param, worker_skipped_ops] diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..06d3d0315cf2932847b79ea799fc592692383287 --- /dev/null +++ b/python/paddle/fluid/distributed/helper.py @@ -0,0 +1,85 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class FileSystem(object): + """ + A file system that support async_executor hadoop client desc. + + Args: + fs_type (string): fs_type, for example is "afs" + user (string): hadoop param + passwd (string): hadoop param + hadoop bin (string): hadoop param + Examples: + fs = FileSystm() + """ + + def __init__(self, + fs_type="afs", + uri="afs://xx", + user=None, + passwd=None, + hadoop_bin=""): + assert user != None + assert passwd != None + assert hadoop_bin != None + import ps_pb2 as pslib + self.fs_client = pslib.FsClientParameter() + self.fs_client.uri = uri + self.fs_client.user = user + self.fs_client.passwd = passwd + #self.fs_client.buffer_size = 0 + self.fs_client.hadoop_bin = hadoop_bin + #self.fs_client.afs_conf = afs_conf if not afs_conf else "" + + def get_desc(self): + """ + get hadoop desc. + """ + return self.fs_client + + +class MPIHelper(object): + """ + MPIHelper is a wrapper of mpi4py, support get_rank get_size etc. + Args: + No params + Examples: + mh = MPIHelper() + mh.get_ip() + """ + + def __init__(self): + from mpi4py import MPI + self.comm = MPI.COMM_WORLD + self.MPI = MPI + + def get_rank(self): + return self.comm.Get_rank() + + def get_size(self): + return self.comm.Get_size() + + def get_ip(self): + import socket + local_ip = socket.gethostbyname(socket.gethostname()) + return local_ip + + def get_hostname(self): + import socket + return socket.gethostname() + + def finalize(self): + self.MPI.Finalize() diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py new file mode 100644 index 0000000000000000000000000000000000000000..41e0d64e0b788b0e354f7635c3d3e52d6bba7e23 --- /dev/null +++ b/python/paddle/fluid/distributed/node.py @@ -0,0 +1,179 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import ps_pb2 as pslib + + +class Server(object): + """ + A Server basic class. + """ + + def __init__(self): + pass + + +class Worker(object): + """ + A Worker basic class. + """ + + def __init__(self): + pass + + +class DownpourServer(Server): + """ + DownpourServer class is used to generate server program_desc + Args: + server: it is pslib.ServerParameter() + Examples: + server = DownpourServer() + """ + + def __init__(self): + self.server_ = pslib.ServerParameter() + self.server_.downpour_server_param.service_param.start_server_port = 0 + self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" + self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" + self.server_.downpour_server_param.service_param.service_class = "DownpourPsService" + self.server_.downpour_server_param.service_param.start_server_port = 0 + self.server_.downpour_server_param.service_param.server_thread_num = 12 + + def add_sparse_table(self, table_id, learning_rate, slot_key_vars, + slot_value_var): + """ + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + slot_key_vars(string): slot key id + slot_value_var(string): slot key value after embedding + Returns: + return None + """ + table = self.server_.downpour_server_param.downpour_table_param.add() + table.table_id = table_id + table.table_class = "DownpourSparseTable" + table.type = pslib.PS_SPARSE_TABLE + table.accessor.accessor_class = "DownpourFeatureValueAccessor" + table.accessor.sparse_sgd_param.learning_rate = learning_rate + table.accessor.sparse_sgd_param.initial_g2sum = 3 + table.accessor.sparse_sgd_param.initial_range = 1e-4 + table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10]) + + table.accessor.embedx_dim = 8 + table.accessor.embedx_threshold = 5 + table.accessor.fea_dim = 11 + table.accessor.downpour_accessor_param.nonclk_coeff = 0.1 + table.accessor.downpour_accessor_param.click_coeff = 2 + table.accessor.downpour_accessor_param.base_threshold = 0.2 + table.accessor.downpour_accessor_param.delta_threshold = 0.15 + table.accessor.downpour_accessor_param.delta_keep_days = 31 + table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999 + table.accessor.downpour_accessor_param.delete_threshold = 0.8 + + def add_dense_table(self, table_id, learning_rate, param_var, grad_var): + """ + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + param_var(list): all dense param. it is a list. + grad_var(list): all dense grad parm it is a list. + Returns: + return None + """ + table = self.server_.downpour_server_param.downpour_table_param.add() + table.table_id = table_id + table.table_class = "DownpourDenseTable" + table.type = pslib.PS_DENSE_TABLE + table.accessor.accessor_class = "DownpourDenseValueAccessor" + table.accessor.dense_sgd_param.name = "adam" + table.accessor.dense_sgd_param.adam.learning_rate = learning_rate + table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993 + table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999 + table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8 + table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99 + table.accessor.dense_sgd_param.naive.learning_rate = 0.0002 + fea_dim = 0 + for param in filter(lambda x: x.name.find("embedding") == -1, + param_var): + fea_dim += reduce(lambda x, y: x * y, param.shape, 1) + table.accessor.fea_dim = fea_dim + + def get_desc(self): + """ + Return downpour server program_desc + """ + return self.server_ + + +class DownpourWorker(Worker): + """ + DownpourWorker class is used to generate worker program_desc + Args: + window (int): push params frequency + worker: it is pslib.DownpourTrainerParameter + Examples: + worker = DownpourWorker(1) + """ + + def __init__(self, window): + self.window = window + self.worker_ = pslib.DownpourTrainerParameter() + + def add_sparse_table(self, table_id, learning_rate, slot_key_vars, + slot_value_vars): + """ + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + slot_key_vars(string): slot key id + slot_value_var(string): slot key value after embedding + Returns: + return None + """ + table = self.worker_.sparse_table.add() + table.table_id = table_id + table.slot_key.extend([var.name for var in slot_key_vars]) + table.slot_value.extend([var.name for var in slot_value_vars]) + table.slot_gradient.extend( + [var.name + "@GRAD" for var in slot_value_vars]) + + def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars): + """ + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + param_var(list): all dense param. it is a list. + grad_var(list): all dense grad parm it is a list. + Returns: + return None + """ + table = self.worker_.dense_table.add() + table.table_id = table_id + table.dense_variable_name.extend( + filter(lambda x: x.find("embedding") == -1, + [p.name for p in param_vars])) + table.dense_gradient_variable_name.extend( + filter(lambda x: x.find("embedding") == -1, + [g.name for g in grad_vars])) + + def get_desc(self): + """ + Return downpour worker program_desc + """ + return self.worker_ diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..d3ce3ce6934d08eb06763fea071a83e460c6bf6c --- /dev/null +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -0,0 +1,148 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +from .helper import MPIHelper + + +class PaddlePSInstance(object): + """ + PaddlePSInstance class is used to generate A instance of server or worker + Args: + server_worker_mode: is a value 0 or 1, default is 1 + proc_per_node: process per node, default is 2 + Examples: + instance = PaddlePSInstance(1, 2) + """ + + def __init__(self, server_worker_mode, proc_per_node): + self.dh = MPIHelper() + self._rankid = self.dh.get_rank() + self._server_worker_mode = server_worker_mode + self._proc_per_node = proc_per_node + self._nodes = self.dh.get_size() + + self._ip = 0 + self._worker_num = self._nodes * self._proc_per_node / 2 + self._server_num = self._nodes * self._proc_per_node / 2 + self._total_server_worker = self._worker_num + self._server_num + self._node_type = None #IDLE=-1, WORKER=1, SERVER=0 + self._set_nodetype() + self._comm = None + self._split_comm() + + def _set_nodetype(self): + if self._server_worker_mode == 0: + if self._rankid < self._server_num: + self._node_type = 1 + elif self._rankid < self._total_server_worker: + self._node_type = 0 + else: + self._node_type = -1 + elif self._server_worker_mode == 1: + if self._rankid < self._total_server_worker: + if 0 == self._rankid % self._proc_per_node % 2: + self._node_type = 0 + else: + self._node_type = 1 + else: + self._node_type = -1 + else: + self._node_type = -1 + + def _split_comm(self): + if self.is_server(): + self._comm = self.dh.comm.Split(self._node_type) + elif self.is_worker(): + self._comm = self.dh.comm.Split(self._node_type) + pass + + def get_worker_index(self): + """ + Return worker index + """ + if self._server_worker_mode == 0: + return self._rankid == self.server_num + else: + return self._rankid / self._proc_per_node + + def get_server_index(self): + """ + Return server index + """ + if self._server_worker_mode == 0: + return self.rank_id + else: + return self.rank_id / self._proc_per_node + + def is_worker(self): + """ + Return instance is worker or not + """ + return self._node_type == 1 + + def is_server(self): + """ + Return instance is server or not + """ + return self._node_type == 0 + + def is_first_worker(self): + """ + Return instance is first worker or not + """ + return self.is_worker() and 0 == self.get_worker_index() + + def set_ip(self, ip): + """ + set server ip + """ + self._ip = ip + + def gather_ips(self): + """ + Return all servers and workers ip throught mpi allgather + """ + self._ips = self.dh.comm.allgather(self._ip) + return self._ips + + def get_node_cnt(self): + """ + Return node cnt + """ + return self._nodes + + def barrier_all(self): + """ + barrier workers and servers + """ + self.dh.comm.barrier() + + def barrier_worker(self): + """ + barrier workers + """ + if self.is_worker(): + self._comm.barrier() + pass + + def finalize(self): + """ + MPI finalize + """ + self.dh.finalize() + pass + + +if __name__ == "__main__": + instance = PaddlePSInstance(1, 1, 2, 50) + instance.barrier_all() diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..0d226c4d593473681658fa3e7764d438a65b7116 --- /dev/null +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -0,0 +1,2296 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: ps.proto + +import sys +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + +DESCRIPTOR = _descriptor.FileDescriptor( + name='ps.proto', + package='paddle', + syntax='proto2', + serialized_pb=_b( + '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01' + )) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +_TABLETYPE = _descriptor.EnumDescriptor( + name='TableType', + full_name='paddle.TableType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None), + ], + containing_type=None, + options=None, + serialized_start=3286, + serialized_end=3338, ) +_sym_db.RegisterEnumDescriptor(_TABLETYPE) + +TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE) +_PSCMDID = _descriptor.EnumDescriptor( + name='PsCmdID', + full_name='paddle.PsCmdID', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_PULL_DENSE_TABLE', + index=0, + number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_TABLE', + index=1, + number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PULL_SPARSE_TABLE', + index=2, + number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_SPARSE_TABLE', + index=3, + number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ONE_TABLE', + index=5, + number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ALL_TABLE', + index=6, + number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ONE_TABLE', + index=7, + number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ALL_TABLE', + index=8, + number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ONE_TABLE', + index=9, + number=9, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ALL_TABLE', + index=10, + number=10, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_PARAM', + index=11, + number=11, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_STOP_SERVER', index=12, number=12, options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3341, + serialized_end=3658, ) +_sym_db.RegisterEnumDescriptor(_PSCMDID) + +PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID) +PS_SPARSE_TABLE = 0 +PS_DENSE_TABLE = 1 +PS_PULL_DENSE_TABLE = 0 +PS_PUSH_DENSE_TABLE = 1 +PS_PULL_SPARSE_TABLE = 2 +PS_PUSH_SPARSE_TABLE = 3 +PS_SHRINK_TABLE = 4 +PS_SAVE_ONE_TABLE = 5 +PS_SAVE_ALL_TABLE = 6 +PS_LOAD_ONE_TABLE = 7 +PS_LOAD_ALL_TABLE = 8 +PS_CLEAR_ONE_TABLE = 9 +PS_CLEAR_ALL_TABLE = 10 +PS_PUSH_DENSE_PARAM = 11 +PS_STOP_SERVER = 12 + +_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( + name='FsApiType', + full_name='paddle.FsClientParameter.FsApiType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='HDFS', index=0, number=0, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='AFS', index=1, number=1, options=None, type=None), + ], + containing_type=None, + options=None, + serialized_start=3254, + serialized_end=3284, ) +_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) + +_PSPARAMETER = _descriptor.Descriptor( + name='PSParameter', + full_name='paddle.PSParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='worker_class', + full_name='paddle.PSParameter.worker_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_class', + full_name='paddle.PSParameter.server_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='instance_class', + full_name='paddle.PSParameter.instance_class', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='worker_param', + full_name='paddle.PSParameter.worker_param', + index=3, + number=101, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_param', + full_name='paddle.PSParameter.server_param', + index=4, + number=102, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='trainer_param', + full_name='paddle.PSParameter.trainer_param', + index=5, + number=301, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fs_client_param', + full_name='paddle.PSParameter.fs_client_param', + index=6, + number=501, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=21, + serialized_end=307, ) + +_WORKERPARAMETER = _descriptor.Descriptor( + name='WorkerParameter', + full_name='paddle.WorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_worker_param', + full_name='paddle.WorkerParameter.downpour_worker_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=309, + serialized_end=390, ) + +_SERVERPARAMETER = _descriptor.Descriptor( + name='ServerParameter', + full_name='paddle.ServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_server_param', + full_name='paddle.ServerParameter.downpour_server_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=392, + serialized_end=473, ) + +_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor( + name='DownpourWorkerParameter', + full_name='paddle.DownpourWorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', + full_name='paddle.DownpourWorkerParameter.downpour_table_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=475, + serialized_end=554, ) + +_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( + name='DownpourTrainerParameter', + full_name='paddle.DownpourTrainerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='dense_table', + full_name='paddle.DownpourTrainerParameter.dense_table', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_table', + full_name='paddle.DownpourTrainerParameter.sparse_table', + index=1, + number=2, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_sparse_per_batch', + full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', + index=2, + number=3, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_dense_per_batch', + full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='skip_op', + full_name='paddle.DownpourTrainerParameter.skip_op', + index=4, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=557, + serialized_end=763, ) + +_DENSETABLEPARAMETER = _descriptor.Descriptor( + name='DenseTableParameter', + full_name='paddle.DenseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.DenseTableParameter.table_id', + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_variable_name', + full_name='paddle.DenseTableParameter.dense_variable_name', + index=1, + number=2, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_gradient_variable_name', + full_name='paddle.DenseTableParameter.dense_gradient_variable_name', + index=2, + number=3, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', + full_name='paddle.DenseTableParameter.fea_dim', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=765, + serialized_end=888, ) + +_SPARSETABLEPARAMETER = _descriptor.Descriptor( + name='SparseTableParameter', + full_name='paddle.SparseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.SparseTableParameter.table_id', + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='feature_dim', + full_name='paddle.SparseTableParameter.feature_dim', + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_key', + full_name='paddle.SparseTableParameter.slot_key', + index=2, + number=3, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_value', + full_name='paddle.SparseTableParameter.slot_value', + index=3, + number=4, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_gradient', + full_name='paddle.SparseTableParameter.slot_gradient', + index=4, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=890, + serialized_end=1012, ) + +_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( + name='DownpourServerParameter', + full_name='paddle.DownpourServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', + full_name='paddle.DownpourServerParameter.downpour_table_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_param', + full_name='paddle.DownpourServerParameter.service_param', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1015, + serialized_end=1149, ) + +_SERVERSERVICEPARAMETER = _descriptor.Descriptor( + name='ServerServiceParameter', + full_name='paddle.ServerServiceParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='server_class', + full_name='paddle.ServerServiceParameter.server_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourBrpcPsServer").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_class', + full_name='paddle.ServerServiceParameter.client_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourBrpcPsClient").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_class', + full_name='paddle.ServerServiceParameter.service_class', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourPsService").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='start_server_port', + full_name='paddle.ServerServiceParameter.start_server_port', + index=3, + number=4, + type=13, + cpp_type=3, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_thread_num', + full_name='paddle.ServerServiceParameter.server_thread_num', + index=4, + number=5, + type=13, + cpp_type=3, + label=1, + has_default_value=True, + default_value=12, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1152, + serialized_end=1367, ) + +_TABLEPARAMETER = _descriptor.Descriptor( + name='TableParameter', + full_name='paddle.TableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.TableParameter.table_id', + index=0, + number=1, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_class', + full_name='paddle.TableParameter.table_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='shared_num', + full_name='paddle.TableParameter.shared_num', + index=2, + number=3, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='accessor', + full_name='paddle.TableParameter.accessor', + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', + full_name='paddle.TableParameter.type', + index=4, + number=5, + type=14, + cpp_type=8, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='compress_in_save', + full_name='paddle.TableParameter.compress_in_save', + index=5, + number=6, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1370, + serialized_end=1561, ) + +_TABLEACCESSORPARAMETER = _descriptor.Descriptor( + name='TableAccessorParameter', + full_name='paddle.TableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='accessor_class', + full_name='paddle.TableAccessorParameter.accessor_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_sgd_param', + full_name='paddle.TableAccessorParameter.sparse_sgd_param', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_sgd_param', + full_name='paddle.TableAccessorParameter.dense_sgd_param', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', + full_name='paddle.TableAccessorParameter.fea_dim', + index=3, + number=4, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_dim', + full_name='paddle.TableAccessorParameter.embedx_dim', + index=4, + number=5, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_threshold', + full_name='paddle.TableAccessorParameter.embedx_threshold', + index=5, + number=6, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='downpour_accessor_param', + full_name='paddle.TableAccessorParameter.downpour_accessor_param', + index=6, + number=7, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_accessor_save_param', + full_name='paddle.TableAccessorParameter.table_accessor_save_param', + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1564, + serialized_end=1933, ) + +_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( + name='DownpourTableAccessorParameter', + full_name='paddle.DownpourTableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='nonclk_coeff', + full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', + index=0, + number=1, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='click_coeff', + full_name='paddle.DownpourTableAccessorParameter.click_coeff', + index=1, + number=2, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='base_threshold', + full_name='paddle.DownpourTableAccessorParameter.base_threshold', + index=2, + number=3, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_threshold', + full_name='paddle.DownpourTableAccessorParameter.delta_threshold', + index=3, + number=4, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_keep_days', + full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', + index=4, + number=5, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='show_click_decay_rate', + full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', + index=5, + number=6, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delete_threshold', + full_name='paddle.DownpourTableAccessorParameter.delete_threshold', + index=6, + number=7, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1936, + serialized_end=2142, ) + +_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( + name='TableAccessorSaveParameter', + full_name='paddle.TableAccessorSaveParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='param', + full_name='paddle.TableAccessorSaveParameter.param', + index=0, + number=1, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='converter', + full_name='paddle.TableAccessorSaveParameter.converter', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='deconverter', + full_name='paddle.TableAccessorSaveParameter.deconverter', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2144, + serialized_end=2227, ) + +_PSREQUESTMESSAGE = _descriptor.Descriptor( + name='PsRequestMessage', + full_name='paddle.PsRequestMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='cmd_id', + full_name='paddle.PsRequestMessage.cmd_id', + index=0, + number=1, + type=13, + cpp_type=3, + label=2, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.PsRequestMessage.table_id', + index=1, + number=2, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='params', + full_name='paddle.PsRequestMessage.params', + index=2, + number=3, + type=12, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_id', + full_name='paddle.PsRequestMessage.client_id', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', + full_name='paddle.PsRequestMessage.data', + index=4, + number=5, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2229, + serialized_end=2330, ) + +_SPARSESGDRULEPARAMETER = _descriptor.Descriptor( + name='SparseSGDRuleParameter', + full_name='paddle.SparseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.SparseSGDRuleParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_g2sum', + full_name='paddle.SparseSGDRuleParameter.initial_g2sum', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_range', + full_name='paddle.SparseSGDRuleParameter.initial_range', + index=2, + number=3, + type=1, + cpp_type=5, + label=1, + has_default_value=True, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='weight_bounds', + full_name='paddle.SparseSGDRuleParameter.weight_bounds', + index=3, + number=4, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2332, + serialized_end=2451, ) + +_DENSESGDRULEPARAMETER = _descriptor.Descriptor( + name='DenseSGDRuleParameter', + full_name='paddle.DenseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', + full_name='paddle.DenseSGDRuleParameter.name', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='adam', + full_name='paddle.DenseSGDRuleParameter.adam', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='naive', + full_name='paddle.DenseSGDRuleParameter.naive', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='summary', + full_name='paddle.DenseSGDRuleParameter.summary', + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='moving_average', + full_name='paddle.DenseSGDRuleParameter.moving_average', + index=4, + number=5, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2454, + serialized_end=2679, ) + +_ADAMSGDPARAMETER = _descriptor.Descriptor( + name='AdamSGDParameter', + full_name='paddle.AdamSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.AdamSGDParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', + full_name='paddle.AdamSGDParameter.avg_decay_rate', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_decay_rate', + full_name='paddle.AdamSGDParameter.ada_decay_rate', + index=2, + number=3, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_epsilon', + full_name='paddle.AdamSGDParameter.ada_epsilon', + index=3, + number=4, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mom_decay_rate', + full_name='paddle.AdamSGDParameter.mom_decay_rate', + index=4, + number=5, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2682, + serialized_end=2816, ) + +_NAIVESGDPARAMETER = _descriptor.Descriptor( + name='NaiveSGDParameter', + full_name='paddle.NaiveSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.NaiveSGDParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', + full_name='paddle.NaiveSGDParameter.avg_decay_rate', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2818, + serialized_end=2884, ) + +_SUMMARYSGDPARAMETER = _descriptor.Descriptor( + name='SummarySGDParameter', + full_name='paddle.SummarySGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='summary_decay_rate', + full_name='paddle.SummarySGDParameter.summary_decay_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=True, + default_value=float(0.999999), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2886, + serialized_end=2945, ) + +_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( + name='MovingAverageRuleParameter', + full_name='paddle.MovingAverageRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='momentum', + full_name='paddle.MovingAverageRuleParameter.momentum', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2947, + serialized_end=2993, ) + +_PSRESPONSEMESSAGE = _descriptor.Descriptor( + name='PsResponseMessage', + full_name='paddle.PsResponseMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='err_code', + full_name='paddle.PsResponseMessage.err_code', + index=0, + number=1, + type=5, + cpp_type=1, + label=2, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='err_msg', + full_name='paddle.PsResponseMessage.err_msg', + index=1, + number=2, + type=9, + cpp_type=9, + label=2, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', + full_name='paddle.PsResponseMessage.data', + index=2, + number=3, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2995, + serialized_end=3068, ) + +_FSCLIENTPARAMETER = _descriptor.Descriptor( + name='FsClientParameter', + full_name='paddle.FsClientParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='fs_type', + full_name='paddle.FsClientParameter.fs_type', + index=0, + number=1, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='uri', + full_name='paddle.FsClientParameter.uri', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='user', + full_name='paddle.FsClientParameter.user', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='passwd', + full_name='paddle.FsClientParameter.passwd', + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='buffer_size', + full_name='paddle.FsClientParameter.buffer_size', + index=4, + number=5, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='hadoop_bin', + full_name='paddle.FsClientParameter.hadoop_bin', + index=5, + number=51, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='afs_conf', + full_name='paddle.FsClientParameter.afs_conf', + index=6, + number=101, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=3071, + serialized_end=3284, ) + +_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER +_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER +_PSPARAMETER.fields_by_name[ + 'trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER +_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER +_WORKERPARAMETER.fields_by_name[ + 'downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER +_SERVERPARAMETER.fields_by_name[ + 'downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER +_DOWNPOURWORKERPARAMETER.fields_by_name[ + 'downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'dense_table'].message_type = _DENSETABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'sparse_table'].message_type = _SPARSETABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name[ + 'downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name[ + 'service_param'].message_type = _SERVERSERVICEPARAMETER +_TABLEPARAMETER.fields_by_name[ + 'accessor'].message_type = _TABLEACCESSORPARAMETER +_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE +_TABLEACCESSORPARAMETER.fields_by_name[ + 'sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name[ + 'summary'].message_type = _SUMMARYSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name[ + 'moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER +_FSCLIENTPARAMETER.fields_by_name[ + 'fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE +_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER +DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER +DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER +DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER +DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER +DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'ServerServiceParameter'] = _SERVERSERVICEPARAMETER +DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'TableAccessorParameter'] = _TABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name[ + 'TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER +DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE +DESCRIPTOR.message_types_by_name[ + 'SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER +DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER +DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER +DESCRIPTOR.message_types_by_name[ + 'MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER +DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE +DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER +DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE +DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID + +PSParameter = _reflection.GeneratedProtocolMessageType( + 'PSParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_PSPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PSParameter) + )) +_sym_db.RegisterMessage(PSParameter) + +WorkerParameter = _reflection.GeneratedProtocolMessageType( + 'WorkerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_WORKERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.WorkerParameter) + )) +_sym_db.RegisterMessage(WorkerParameter) + +ServerParameter = _reflection.GeneratedProtocolMessageType( + 'ServerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SERVERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerParameter) + )) +_sym_db.RegisterMessage(ServerParameter) + +DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourWorkerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURWORKERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter) + )) +_sym_db.RegisterMessage(DownpourWorkerParameter) + +DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourTrainerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURTRAINERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter) + )) +_sym_db.RegisterMessage(DownpourTrainerParameter) + +DenseTableParameter = _reflection.GeneratedProtocolMessageType( + 'DenseTableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DENSETABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter) + )) +_sym_db.RegisterMessage(DenseTableParameter) + +SparseTableParameter = _reflection.GeneratedProtocolMessageType( + 'SparseTableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SPARSETABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter) + )) +_sym_db.RegisterMessage(SparseTableParameter) + +DownpourServerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourServerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURSERVERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter) + )) +_sym_db.RegisterMessage(DownpourServerParameter) + +ServerServiceParameter = _reflection.GeneratedProtocolMessageType( + 'ServerServiceParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SERVERSERVICEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter) + )) +_sym_db.RegisterMessage(ServerServiceParameter) + +TableParameter = _reflection.GeneratedProtocolMessageType( + 'TableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableParameter) + )) +_sym_db.RegisterMessage(TableParameter) + +TableAccessorParameter = _reflection.GeneratedProtocolMessageType( + 'TableAccessorParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEACCESSORPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter) + )) +_sym_db.RegisterMessage(TableAccessorParameter) + +DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourTableAccessorParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURTABLEACCESSORPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter) + )) +_sym_db.RegisterMessage(DownpourTableAccessorParameter) + +TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType( + 'TableAccessorSaveParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEACCESSORSAVEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter) + )) +_sym_db.RegisterMessage(TableAccessorSaveParameter) + +PsRequestMessage = _reflection.GeneratedProtocolMessageType( + 'PsRequestMessage', + (_message.Message, ), + dict( + DESCRIPTOR=_PSREQUESTMESSAGE, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage) + )) +_sym_db.RegisterMessage(PsRequestMessage) + +SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType( + 'SparseSGDRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SPARSESGDRULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter) + )) +_sym_db.RegisterMessage(SparseSGDRuleParameter) + +DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType( + 'DenseSGDRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DENSESGDRULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter) + )) +_sym_db.RegisterMessage(DenseSGDRuleParameter) + +AdamSGDParameter = _reflection.GeneratedProtocolMessageType( + 'AdamSGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_ADAMSGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter) + )) +_sym_db.RegisterMessage(AdamSGDParameter) + +NaiveSGDParameter = _reflection.GeneratedProtocolMessageType( + 'NaiveSGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_NAIVESGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter) + )) +_sym_db.RegisterMessage(NaiveSGDParameter) + +SummarySGDParameter = _reflection.GeneratedProtocolMessageType( + 'SummarySGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SUMMARYSGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter) + )) +_sym_db.RegisterMessage(SummarySGDParameter) + +MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType( + 'MovingAverageRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_MOVINGAVERAGERULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter) + )) +_sym_db.RegisterMessage(MovingAverageRuleParameter) + +PsResponseMessage = _reflection.GeneratedProtocolMessageType( + 'PsResponseMessage', + (_message.Message, ), + dict( + DESCRIPTOR=_PSRESPONSEMESSAGE, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage) + )) +_sym_db.RegisterMessage(PsResponseMessage) + +FsClientParameter = _reflection.GeneratedProtocolMessageType( + 'FsClientParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_FSCLIENTPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.FsClientParameter) + )) +_sym_db.RegisterMessage(FsClientParameter) + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), + _b('\200\001\001')) +# @@protoc_insertion_point(module_scope) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9e6cd1a0ab54d9fd2764b787416e4f5c86755a68..4d8311a0d3ada78e4f6cc54f8990e2a2e2cadc4d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -176,6 +176,7 @@ __all__ = [ 'get_tensor_from_selected_rows', 'lstm', 'psroi_pool', + 'huber_loss', ] kIgnoreIndex = -100 @@ -497,7 +498,7 @@ def lstm(input, If Device is GPU, This op will use cudnn LSTM implementation A four-gate Long Short-Term Memory network with no peephole connections. - In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, + In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ @@ -524,19 +525,19 @@ def lstm(input, - $\tilde{c_t}$ is also called candidate hidden state, which is computed based on the current input and the previous hidden state. - Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, + Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, X represensts a matrix multiplication Args: input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size ) - init_h(Variable): The initial hidden state of the LSTM + init_h(Variable): The initial hidden state of the LSTM This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) init_c(Variable): The initial cell state of the LSTM. This is a tensor with shape ( num_layers x batch_size x hidden_size ) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) - max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len + max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len hidden_size (int): hidden size of the LSTM num_layers (int): total layers number of the LSTM dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps @@ -555,10 +556,10 @@ def lstm(input, if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) last_h(Tensor): the hidden state of the last step of LSTM shape is ( num_layers x batch_size x hidden_size ) - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) last_c(Tensor): the cell state of the last step of LSTM shape is ( num_layers x batch_size x hidden_size ) - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) Examples: @@ -4658,7 +4659,7 @@ def ctc_greedy_decoder(input, blank, name=None): [0.5, 0.1, 0.3, 0.1]] input.lod = [[4, 4]] - + Computation: step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get: @@ -4691,7 +4692,7 @@ def ctc_greedy_decoder(input, blank, name=None): Returns: Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. 'Lp' is the sum if all output sequences' length. If all the sequences - in result were empty, the result LoDTensor will be [-1] with + in result were empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1]. Examples: @@ -5045,7 +5046,7 @@ def hsigmoid(input, """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a - complete binary tree, or you can use is_custom to pass your own tree to + complete binary tree, or you can use is_custom to pass your own tree to implement hierarchical. Each leaf node represents a class(a word) and each internal node acts as a binary classifier. For each word there's a unique path from root to it's leaf node, hsigmoid calculate the cost for each @@ -5061,7 +5062,7 @@ def hsigmoid(input, 2. build a dict to store word_id -> word's leaf to root path, we call it path_table. 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code means label of each binary classification, using 1 indicate true, 0 indicate false. - 4. now, each word should has its path and code along the path, you can pass a batch of path and code + 4. now, each word should has its path and code along the path, you can pass a batch of path and code related to the same batch of inputs. @@ -5071,8 +5072,8 @@ def hsigmoid(input, and :math:`D` is the feature size. label (Variable): The tensor variable contains labels of training data. It's a tensor with shape is :math:`[N \\times 1]`. - num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, - it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num + num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, + it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num which indicates the num of classes using by binary classify. param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid @@ -5085,15 +5086,15 @@ def hsigmoid(input, is not set, the bias is initialized zero. Default: None. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None. - path_table: (Variable|None) this variable can store each batch of samples' path to root, + path_table: (Variable|None) this variable can store each batch of samples' path to root, it should be in leaf -> root order - path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like - structure and each element in this array is indexes in parent nodes' Weight Matrix. - path_code: (Variable|None) this variable can store each batch of samples' code, + path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like + structure and each element in this array is indexes in parent nodes' Weight Matrix. + path_code: (Variable|None) this variable can store each batch of samples' code, each code consist with every code of parent nodes. it should be in leaf -> root order - is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is + is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is set you need to set path_table/path_code/num_classes, otherwise num_classes should be set - is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient + is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient of W and input will be sparse. Returns: @@ -9377,3 +9378,51 @@ def psroi_pool(input, 'pooled_width': pooled_width }) return out + + +def huber_loss(input, label, delta): + """ + Huber loss is a loss function used in robust. + Huber loss can evaluate the fitness of input to label. + Different from MSE loss, Huber loss is more robust for outliers. + + When the difference between input and label is large than delta + .. math:: + + huber\_loss = delta * (label - input) - 0.5 * delta * delta + + When the difference between input and label is less than delta + .. math:: + + huber\_loss = 0.5 * (label - input) * (label - input) + + + Args: + input (Variable): This input is a probability computed by the previous operator. + The first dimension is batch size, and the last dimension is 1. + label (Variable): The groud truth whose first dimension is batch size + and last dimension is 1. + delta (float): The parameter of huber loss, which controls + the range of outliers + + Returns: + huber\_loss (Variable): The huber loss with shape [batch_size, 1]. + + Examples: + .. code-block:: python + + predictions = fluid.layers.softmax(x) + loss = fluid.layers.huber_loss(input=predictions, label=label, 1.0) + """ + helper = LayerHelper('huber_loss', **locals()) + residual = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + helper.append_op( + type='huber_loss', + inputs={'X': input, + 'Y': label}, + outputs={'Out': out, + 'Residual': residual}, + attrs={'delta': delta}) + return out diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 86f861674c26fe61e624103c2a0d70f816a1aebc..e2a9fc183ea9206efd892b23844081cb9d2fb3d3 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, + use_ir_memory_optimize=False, fuse_elewise_add_act_ops=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, @@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.memory_optimize = use_ir_memory_optimize build_strategy.enable_sequential_execution = enable_sequential_execution if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py index e91cfe0b45ab7e4e56fccf8d49eb381fbbd199d1..89476ee641f1dd295a3caca89ac41038cad317f2 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -39,6 +39,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost = network(data, label, len(word_dict)) + cost.persistable = True optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) optimizer.minimize(cost) diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..6ca65c5d3b689612f6624a7e0e16c4dabbae1738 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + +MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" + + +def _feed_data_helper(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + return img, label + + +def simple_fc_net(use_feed): + x, y = _feed_data_helper(use_feed) + hidden_layer = 4 + for _ in range(hidden_layer): + x = fluid.layers.fc(input=x, size=20, act='relu') + y_predict = fluid.layers.fc(input=x, size=10, act='softmax') + cost = fluid.layers.cross_entropy(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +def fc_with_inplace_net(use_feed): + x, y = _feed_data_helper(use_feed) + fc = fluid.layers.fc(input=x, size=20, act='relu') + fc = fluid.layers.fc(input=fc, size=10, act='relu') + reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5]) + reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2]) + y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax') + cost = fluid.layers.cross_entropy(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +class TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=4) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + MNIST_RECORDIO_FILE, reader, feeder) + + def _dummy_data(self): + np.random.seed(5) + img = np.random.random(size=[32, 784]).astype(np.float32) + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_ir_and_python_memory_optimize(self, model, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + + img, label = self._dummy_data() + first_loss0, last_loss0 = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + memory_opt=False, + use_ir_memory_optimize=False) + first_loss1, last_loss1 = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + memory_opt=False, + use_ir_memory_optimize=True) + for loss in zip(first_loss0, first_loss1): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + for loss in zip(last_loss0, last_loss1): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_net(self): + self._compare_ir_and_python_memory_optimize(simple_fc_net, False) + self._compare_ir_and_python_memory_optimize(simple_fc_net, True) + + def test_fc_with_reshape_net(self): + self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False) + self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 95aafec05361a8b66b849268c7a738bb2ee5da86..7b530ba617498c79edc52ef177990410daf6b2a4 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -43,6 +43,7 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), ("conditional_block", "conditional_block_grad")] PRINT_LOG = False +FLAGS_memory_optimize = "" class OrderedSet(MutableSet): @@ -121,6 +122,7 @@ class ControlFlowGraph(object): self._defs = defaultdict(OrderedSet) self._live_in = defaultdict(OrderedSet) self._live_out = defaultdict(OrderedSet) + self._skip_opt = skip_opt self.pool = [] @@ -144,7 +146,6 @@ class ControlFlowGraph(object): for i in range(self.op_size): self._uses[i].update(self._ops[i].input_arg_names()) self._defs[i].update(self._ops[i].output_arg_names()) - self._live_in[i] = self._uses[i] def _update_graph(self, old_name, new_name, begin_idx=0): for i in range(begin_idx, self.op_size): @@ -177,20 +178,52 @@ class ControlFlowGraph(object): worklist.append(d) def _fill_pool(self, i, is_forward): + def comparator(x, cache): + x_shape = x[1] + cache_shape = cache[1] + x_size = abs(reduce(lambda x, y: x * y, x_shape)) + cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) + if (x_shape[0] == -1 and cache_shape[0] == -1) or \ + (x_shape[0] != -1 and cache_shape[0] != -1) : + return x_size <= cache_size + else: + return False + + def find_var_in_block(x): + known_vars = set() + for op in self._ops: + known_vars.update(op.output_arg_names()) + return x in known_vars + block_desc = self._ops[i].block() in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) # NOTE: must sort the in_diff set for cases that get different cache var. # FIXME(typhoonzero): maybe use a "sorted set" is better than this. can_optimize = [ - x for x in in_diff + x for x in sorted(in_diff) if self._check_var_validity(block_desc, x, is_forward) ] if can_optimize: for var_name in can_optimize: cache = (var_name, self._find_var(block_desc, var_name, is_forward).shape()) - if cache not in self.pool: - self.pool.append(cache) + if cache not in self.pool and find_var_in_block(var_name): + i = 0 + while i < len(self.pool): + mycache = self.pool[i] + mysize = mycache[1][0] + cache_size = cache[1][0] + if (mysize == -1 and cache_size == -1) or \ + (mysize != -1 and cache_size != -1): + if comparator(mycache, cache): + i += 1 + else: + break + elif mysize == -1 and cache_size != -1: + i += 1 + elif mysize != -1 and cache_size == -1: + break + self.pool.insert(i, cache) def _get_diff(self, a, b): u = a & b @@ -229,7 +262,7 @@ class ControlFlowGraph(object): def _update_skip_opt_set(self): for i in range(self.op_size): op = self._ops[i] - if op.type() == "fill_constant" and op.attr("force_cpu") == True: + if op.has_attr("force_cpu") and op.attr("force_cpu") == True: self._skip_opt.update(op.output_arg_names()) def release_memory(self, skip_opt_set=None): @@ -281,6 +314,7 @@ class ControlFlowGraph(object): # update skip set to meet users' demand if skip_opt_set: self._skip_opt.update(skip_opt_set) + counter = 0 for i in range(self.op_size): op = self._ops[i] if op.type() in SUB_BLOCK_OPS: @@ -301,6 +335,9 @@ class ControlFlowGraph(object): # If x is both in uses and defs, it can not be optimized! if x in self._uses[i]: continue + if x == FLAGS_memory_optimize: + print("start match var ", x, " of op ", op.type()) + print(self.pool) for index, cache_pair in enumerate(self.pool): cache_var = cache_pair[0] cache_shape = cache_pair[1] @@ -323,15 +360,13 @@ class ControlFlowGraph(object): if not compare_shape(x_shape, cache_shape, level): continue # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype] - if x_dtype != cache_dtype: - continue - if PRINT_LOG: - print(("Hit Cache !!!! cache pool index " - "is %d, var name is %s, " - "cached var name is %s, " - "var shape is %s ") % (index, x, cache_var, - str(cache_shape))) + print( + ("!!! %d, %s => %s, cache idx %d, pool size %d" + % (counter, x + str(x_shape), + cache_var + str(cache_shape), index, + len(self.pool)))) + counter += 1 self.pool.pop(index) # Rename the var to the cache var already with # memory allocated in order to reuse the memory. @@ -484,8 +519,11 @@ def memory_optimize(input_program, if level != 0 and level != 1: raise ValueError("only support opt_level 0 or 1.") - if skip_opt_set is not None and not isinstance(skip_opt_set, set): - raise ValueError("only support skip_opt_set as set.") + if skip_opt_set is not None: + if isinstance(skip_opt_set, set) or isinstance(skip_opt_set, list): + skip_opt_set = set(skip_opt_set) + else: + raise ValueError("only support skip_opt_set as set.") global PRINT_LOG PRINT_LOG = print_log if skip_grads: diff --git a/python/setup.py.in b/python/setup.py.in index 0eb69cdb5c7d140527dba7a648728750bfb404f7..cf8f28bd250d82c18b296a4d01c3a9856d801b8c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -104,8 +104,10 @@ packages=['paddle', 'paddle.fluid.imperative', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', + 'paddle.fluid.distributed', 'paddle.fluid.layers', 'paddle.fluid.contrib', + 'paddle.fluid.contrib.utils', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', 'paddle.fluid.transpiler', @@ -241,5 +243,6 @@ setup(name='${PACKAGE_NAME}', ext_modules=ext_modules, package_data=package_data, package_dir=package_dir, - scripts=paddle_bins + scripts=paddle_bins, + distclass=BinaryDistribution )