Commit 3b7b2e1d authored by: J JiabinYang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/add_prefech_hs

......@@ -65,6 +65,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
......@@ -131,8 +132,6 @@ if (APPLE OR WIN32)
endif()
if (WIN32)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when compiling for Windows" FORCE)
set(WITH_DSO OFF CACHE STRING
"Disable DSO when compiling for Windows" FORCE)
set(WITH_MKL OFF CACHE STRING
......@@ -217,6 +216,12 @@ include(cupti)
include(external/gzstream)
endif (NOT WIN32)
if(WITH_PSLIB)
include(external/libmct)
include(external/pslib_brpc)
include(external/pslib)
endif(WITH_PSLIB)
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
include(external/grpc)
......@@ -284,6 +289,12 @@ set(EXTERNAL_LIBS
${PYTHON_LIBRARIES}
)
if(WITH_PSLIB)
list(APPEND EXTERNAL_LIBS pslib)
list(APPEND EXTERNAL_LIBS pslib_brpc)
list(APPEND EXTERNAL_LIBS libmct)
endif(WITH_PSLIB)
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
......
......@@ -84,6 +84,10 @@ if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
if(WITH_PSLIB)
add_definitions(-DPADDLE_WITH_PSLIB)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
......
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_LIBMCT})
return()
ENDIF(NOT ${WITH_LIBMCT})
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with LIBMCT in Paddle yet."
"Force WITH_LIBMCT=OFF")
SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(LIBMCT_PROJECT "extern_libmct")
IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
MESSAGE(STATUS "use pre defined download url")
SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE)
SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
SET(LIBMCT_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${LIBMCT_VER}/${LIBMCT_NAME}.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
SET(LIBMCT_SOURCE_DIR "${THIRD_PARTY_PATH}/libmct")
SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}")
SET(LIBMCT_DST_DIR "libmct")
SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR})
SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR})
SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib")
INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR})
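# The CMakeLists.txt generated below only installs the pre-built headers and
# libraries unpacked from the downloaded tarball; nothing is compiled here.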
FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(LIBMCT)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n"
" DESTINATION ${LIBMCT_DST_DIR})\n")
ExternalProject_Add(
${LIBMCT_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${LIBMCT_SOURCE_DIR}
DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz
&& tar zxvf ${LIBMCT_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT}
)
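# A dummy source file keeps the libmct target buildable on older CMake; the
# target exists mainly to carry the dependency on the extern_libmct step.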
if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/libmct_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(libmct STATIC ${dummyfile})
else()
add_library(libmct INTERFACE)
endif()
#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
LIST(APPEND external_project_dependencies libmct)
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_PSLIB})
return()
ENDIF(NOT ${WITH_PSLIB})
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with PSLIB in Paddle yet."
"Force WITH_PSLIB=OFF")
SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(PSLIB_PROJECT "extern_pslib")
IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL))
MESSAGE(STATUS "use pre defined download url")
SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE)
SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE)
SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}")
SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib")
SET(PSLIB_DOWNLOAD_DIR "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}")
SET(PSLIB_DST_DIR "pslib")
SET(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR})
SET(PSLIB_ROOT ${PSLIB_INSTALL_DIR})
SET(PSLIB_INC_DIR ${PSLIB_ROOT}/include)
SET(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib)
SET(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so)
SET(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib")
INCLUDE_DIRECTORIES(${PSLIB_INC_DIR})
FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(PSLIB)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n"
" DESTINATION ${PSLIB_DST_DIR})\n")
ExternalProject_Add(
${PSLIB_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${PSLIB_SOURCE_DIR}
DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz
&& tar zxvf ${PSLIB_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
)
ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
LIST(APPEND external_project_dependencies pslib)
IF(WITH_C_API)
INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib)
ENDIF()
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_PSLIB_BRPC})
return()
ENDIF(NOT ${WITH_PSLIB_BRPC})
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with PSLIB_BRPC in Paddle yet."
"Force WITH_PSLIB_BRPC=OFF")
SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc")
IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL))
MESSAGE(STATUS "use pre defined download url")
SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE)
SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE)
SET(PSLIB_BRPC_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_BRPC_VER}/${PSLIB_BRPC_NAME}.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}")
SET(PSLIB_BRPC_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib_brpc")
SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}")
SET(PSLIB_BRPC_DST_DIR "pslib_brpc")
SET(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR})
SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR})
SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include)
SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib)
SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a)
SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib")
INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR})
FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(PSLIB_BRPC)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n"
" DESTINATION ${PSLIB_BRPC_DST_DIR})\n")
ExternalProject_Add(
${PSLIB_BRPC_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${PSLIB_BRPC_SOURCE_DIR}
DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz
&& tar zxvf ${PSLIB_BRPC_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT}
)
ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
LIST(APPEND external_project_dependencies pslib_brpc)
IF(WITH_C_API)
INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib)
ENDIF()
......@@ -24,12 +24,6 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if (WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
else(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif (WIN32)
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
......@@ -56,6 +50,16 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
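# On Windows the build emits snappy.lib; it is copied to libsnappy.lib below so
# a single lib-prefixed filename can be referenced on every platform.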
IF(WIN32)
IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
add_custom_command(TARGET extern_snappy POST_BUILD
COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib
)
ENDIF()
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
else(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif (WIN32)
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
......
......@@ -56,7 +56,12 @@ else()
endif()
if (WIN32)
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib")
IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib")
add_custom_command(TARGET extern_xxhash POST_BUILD
COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib
)
ENDIF()
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib")
else()
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
endif ()
......
......@@ -19,12 +19,6 @@ SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
IF(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
ELSE(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
......@@ -49,6 +43,16 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${ZLIB_INSTALL_DIR}/lib/libz.lib")
add_custom_command(TARGET extern_zlib POST_BUILD
COMMAND cmake -E copy ${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib ${ZLIB_INSTALL_DIR}/lib/libz.lib
)
ENDIF()
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.lib" CACHE FILEPATH "zlib library." FORCE)
ELSE(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
ENDIF(WIN32)
ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
......
......@@ -32,24 +32,35 @@ function(copy TARGET)
list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst)
if (WIN32)
# windows cmd shell will not expand wildcards automatically.
# below expands the files and libs and copies them by rules.
file(GLOB header_files ${src} "*.h")
file(GLOB static_lib_files ${src} "*.lib")
file(GLOB dll_lib_files ${src} "*.dll")
set(src_files ${header_files} ${static_lib_files} ${dll_lib_files})
if (NOT "${src_files}" STREQUAL "")
list(REMOVE_DUPLICATES src_files)
endif ()
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
)
foreach (src_file ${src_files})
if(IS_DIRECTORY ${src})
get_filename_component(last_path ${src} NAME)
string(APPEND dst "/" ${last_path})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
COMMENT "copying ${src_file} -> ${dst}")
endforeach ()
COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
)
if(EXISTS ${src})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND cmake -E copy_directory "${src}" "${dst}"
COMMENT "copying ${src} -> ${dst}")
else()
message(WARNING "${src} does not exist!")
endif()
else()
# windows cmd shell will not expand wildcards automatically.
# below expands the files and copies them by rules.
file(GLOB src_files ${src})
if (NOT "${src_files}" STREQUAL "")
list(REMOVE_DUPLICATES src_files)
endif ()
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
)
foreach (src_file ${src_files})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
COMMENT "copying ${src_file} -> ${dst}")
endforeach ()
endif()
else (WIN32) # not windows
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND mkdir -p "${dst}"
......@@ -95,7 +106,7 @@ copy(xxhash_lib
DEPS xxhash
)
if (NOT PROTOBUF_FOUND)
if (NOT PROTOBUF_FOUND OR WIN32)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
......@@ -138,27 +149,25 @@ if (WITH_NGRAPH)
)
endif ()
if (NOT WIN32)
if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappy)
if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib)
endif ()
endif (NOT WIN32)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib)
endif ()
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
......@@ -191,9 +200,21 @@ if (WITH_ANAKIN AND WITH_MKL)
list(APPEND inference_deps anakin_inference_lib)
endif ()
if (TENSORRT_FOUND)
copy(tensorrt_lib DEPS ${inference_deps}
SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
endif ()
set(module "inference")
if(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*)
else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
endif(WIN32)
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib}
${src_dir}/${module}/api/paddle_*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
......@@ -233,7 +254,7 @@ copy(third_party DEPS fluid_lib_dist
# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
copy(inference_api_lib DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
SRCS ${paddle_fluid_lib}
${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
)
......
......@@ -37,8 +37,16 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non
paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, ''))
paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12))
paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
......@@ -201,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
......@@ -367,7 +376,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
# windows treats a symbolic file as a real file, which is different from unix
# We create a hidden file and compile it instead of the origin source file.
#windows treats a symbolic file as a real file, which is different from unix
#We create a hidden file and compile it instead of the origin source file.
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(multiValueArgs SRCS PATH)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
message(FATAL_ERROR "${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
endif()
# only copy the xx.cu to .xx.cu when the content is modified
#only copy the xx.cu to .xx.cu when the content is modified
set(copy_flag 1)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
......@@ -32,7 +33,7 @@ endfunction()
add_subdirectory(ir)
add_subdirectory(details)
# ddim lib
#ddim lib
proto_library(framework_proto SRCS framework.proto)
proto_library(async_executor_proto SRCS data_feed.proto)
......@@ -91,8 +92,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
if(WITH_GPU)
if (WIN32)
# windows treats a symbolic file as a real file, which is different from unix
# We create a hidden file and compile it instead of the origin source file.
#windows treats a symbolic file as a real file, which is different from unix
#We create a hidden file and compile it instead of the origin source file.
windows_symbolic(hidden_file SRCS data_type_transform.cu)
nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
add_dependencies(data_type_transform hidden_file)
......@@ -143,7 +144,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
# Generate an empty __init__.py to make framework_py_proto a valid python module.
#Generate an empty \
#__init__.py to make framework_py_proto a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
if (NOT WIN32)
......@@ -195,7 +197,12 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
graph build_strategy
fast_threaded_ssa_graph_executor variable_helper)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
if(WITH_PSLIB)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib)
else()
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
endif(WITH_PSLIB)
cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto)
......
......@@ -29,6 +29,9 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
#ifdef PADDLE_WITH_PSLIB
#include <pslib.h>
#endif
namespace paddle {
namespace framework {
......@@ -47,6 +50,11 @@ void AsyncExecutor::CreateThreads(
worker->SetDataFeed(reader);
worker->SetFetchVarNames(fetch_var_names);
worker->BindingDataFeedMemory();
#ifdef PADDLE_WITH_PSLIB
worker->SetPSlibPtr(_pslib_ptr);
worker->SetPullDenseThread(_pull_dense_thread);
worker->SetParamConfig(&_param_config);
#endif
}
void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers, // NOLINT
......@@ -60,12 +68,177 @@ void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers, // NOLINT
readers[0]->SetFileList(filelist);
}
#ifdef PADDLE_WITH_PSLIB
void AsyncExecutor::InitServer(const std::string& dist_desc, int index) {
_pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
new paddle::distributed::PSlib());
_pslib_ptr->init_server(dist_desc, index);
InitParamConfig();
}
void AsyncExecutor::InitWorker(const std::string& dist_desc,
const std::vector<uint64_t>& host_sign_list,
int node_num, int index) {
_pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
new paddle::distributed::PSlib());
_pslib_ptr->init_worker(
dist_desc, const_cast<uint64_t*>(host_sign_list.data()), node_num, index);
InitParamConfig();
}
uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); }
void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); }
void AsyncExecutor::GatherServers(const std::vector<uint64_t>& host_sign_list,
int node_num) {
_pslib_ptr->gather_servers(const_cast<uint64_t*>(host_sign_list.data()),
node_num);
}
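// InitParamConfig mirrors the PSLIB proto config into _param_config: the
// feature/slot dims, push wait times, skipped ops, and the sparse/dense
// table-id -> variable-name mappings consumed by the worker threads.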
void AsyncExecutor::InitParamConfig() {
for (int i = 0; i < _pslib_ptr->get_param()
->server_param()
.downpour_server_param()
.downpour_table_param_size();
++i) {
if (_pslib_ptr->get_param()
->server_param()
.downpour_server_param()
.downpour_table_param(i)
.table_class()
.find("SparseTable") != -1) {
_param_config.fea_dim = _pslib_ptr->get_param()
->server_param()
.downpour_server_param()
.downpour_table_param(i)
.accessor()
.fea_dim();
break;
}
}
_param_config.slot_dim = _param_config.fea_dim - 2;
_param_config.tmp_push_dense_wait_times = static_cast<int32_t>(
_pslib_ptr->get_param()->trainer_param().push_dense_per_batch());
_param_config.tmp_push_sparse_wait_times = static_cast<int32_t>(
_pslib_ptr->get_param()->trainer_param().push_sparse_per_batch());
for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size();
++t) {
_param_config.skip_op.push_back(
_pslib_ptr->get_param()->trainer_param().skip_op(t));
}
for (auto t = 0u;
t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) {
auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t);
std::vector<std::string> tmp_sparse_variable_name;
for (int i = 0; i < table.slot_value_size(); ++i) {
tmp_sparse_variable_name.push_back(table.slot_value(i));
_param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id();
}
std::vector<std::string> tmp_sparse_gradient_variable_name;
for (auto i = 0u; i < table.slot_gradient_size(); ++i) {
tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i));
}
_param_config.slot_input_vec[table.table_id()] =
std::move(tmp_sparse_variable_name);
_param_config.gradient_var[table.table_id()] =
std::move(tmp_sparse_gradient_variable_name);
_param_config.sparse_table_id.push_back(table.table_id());
}
for (auto t = 0u;
t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) {
auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t);
std::vector<std::string> tmp_dense_variable_name;
for (int i = 0; i < table.dense_variable_name_size(); ++i) {
tmp_dense_variable_name.push_back(table.dense_variable_name(i));
}
std::vector<std::string> tmp_dense_gradient_variable_name;
for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) {
tmp_dense_gradient_variable_name.push_back(
table.dense_gradient_variable_name(i));
}
_param_config.dense_variable_name[table.table_id()] =
std::move(tmp_dense_variable_name);
_param_config.dense_gradient_variable_name[table.table_id()] =
std::move(tmp_dense_gradient_variable_name);
_param_config.dense_table_id.push_back(table.table_id());
_param_config.dense_table_size.push_back(table.fea_dim());
}
}
void AsyncExecutor::InitModel() {
for (auto table_id : _param_config.dense_table_id) {
std::vector<paddle::ps::Region> regions;
for (auto& t : _param_config.dense_variable_name[table_id]) {
Variable* var = root_scope_->FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* g = tensor->data<float>();
CHECK(g != nullptr) << "var[" << t << "] value not initialized";
float init_range = 0.2;
int rown = tensor->dims()[0];
init_range /= sqrt(rown);
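// Scaled-normal init: each N(0, 1) sample below is multiplied by
// 0.2 / sqrt(rows), shrinking the initial spread with fan-in.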
std::normal_distribution<float> ndistr(0.0, 1.0);
for (auto i = 0u; i < tensor->numel(); ++i) {
g[i] = ndistr(local_random_engine()) * init_range;
}
paddle::ps::Region reg(g, tensor->numel());
regions.emplace_back(std::move(reg));
}
auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(
regions.data(), regions.size(), table_id);
push_status.wait();
auto status = push_status.get();
if (status != 0) {
LOG(FATAL) << "push dense param failed, status[" << status << "]";
exit(-1);
}
}
}
void AsyncExecutor::SaveModel(const std::string& path) {
auto ret = _pslib_ptr->_worker_ptr->flush();
ret.wait();
ret = _pslib_ptr->_worker_ptr->save(path, 0);
ret.wait();
int32_t feasign_cnt = ret.get();
if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0
LOG(FATAL) << "save model failed";
exit(-1);
}
}
void AsyncExecutor::PrepareDenseThread(const std::string& mode) {
if (mode == "mpi") {
DensePullThreadParam param;
param.ps_client = _pslib_ptr->_worker_ptr;
param.threshold = 1;
param.training_thread_num = actual_thread_num;
param.root_scope = root_scope_;
param.dense_params = &_param_config.dense_variable_name;
_pull_dense_thread =
std::shared_ptr<DensePullThread>(new DensePullThread(param));
_pull_dense_thread->start();
}
}
#endif
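// A minimal usage sketch of the PSLIB entry points above (illustrative only,
// not part of this patch; desc/hosts/rank are assumed to come from the caller):
//   server:  exec->InitServer(desc, rank); exec->StartServer();
//   worker:  exec->InitWorker(desc, hosts, static_cast<int>(hosts.size()), rank);
//            exec->InitModel();  // push randomly initialized dense params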
void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
const std::string& data_feed_desc_str,
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_var_names,
const bool debug) {
const std::string& mode, const bool debug) {
std::vector<std::thread> threads;
auto& block = main_program.Block(0);
......@@ -82,7 +255,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
&data_feed_desc);
int actual_thread_num = thread_num;
actual_thread_num = thread_num;
int file_cnt = filelist.size();
PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
......@@ -106,11 +279,21 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
// todo: should be factory method for creating datafeed
std::vector<std::shared_ptr<DataFeed>> readers;
PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
#ifdef PADDLE_WITH_PSLIB
PrepareDenseThread(mode);
#endif
std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
workers.resize(actual_thread_num);
for (auto& worker : workers) {
#ifdef PADDLE_WITH_PSLIB
if (mode == "mpi") {
worker.reset(new AsyncExecutorThreadWorker);
} else {
worker.reset(new ExecutorThreadWorker);
}
#else
worker.reset(new ExecutorThreadWorker);
#endif
}
// prepare thread resource here
......@@ -128,7 +311,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
for (auto& th : threads) {
th.join();
}
#ifdef PADDLE_WITH_PSLIB
if (mode == "mpi") {
_pull_dense_thread->stop();
}
#endif
root_scope_->DropKids();
return;
......
......@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once
#include <time.h>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <mutex> // NOLINT
#include <random> // local_random_engine
#include <set>
#include <string>
#include <thread> // NOLINT
......@@ -30,6 +32,31 @@ limitations under the License. */
namespace paddle {
namespace framework {
inline double current_realtime() {
#if !defined(_WIN32)
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
return tp.tv_sec + tp.tv_nsec * 1e-9;
#else
return 0.0;
#endif
}
inline std::default_random_engine& local_random_engine() {
struct engine_wrapper_t {
std::default_random_engine engine;
engine_wrapper_t() {
static std::atomic<uint64_t> x(0);
std::seed_seq sseq = {x++, x++, x++,
static_cast<uint64_t>(current_realtime() * 1000)};
engine.seed(sseq);
}
};
thread_local engine_wrapper_t r;
return r.engine;
}
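// Usage sketch (illustrative): each thread lazily seeds its own engine, so
// callers can sample without locking, e.g.
//   std::normal_distribution<float> dist(0.0f, 1.0f);
//   float v = dist(local_random_engine());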
class AsyncExecutor {
public:
AsyncExecutor(Scope* scope, const platform::Place& place);
......@@ -39,7 +66,19 @@ class AsyncExecutor {
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_names,
const bool debug = false);
const std::string& mode, const bool debug = false);
#ifdef PADDLE_WITH_PSLIB
void InitServer(const std::string& dist_desc, int index);
void InitWorker(const std::string& dist_desc,
const std::vector<uint64_t>& host_sign_list, int node_num,
int index);
uint64_t StartServer();
void StopServer();
void GatherServers(const std::vector<uint64_t>& host_sign_list, int node_num);
void InitModel();
void SaveModel(const std::string& path);
void InitParamConfig();
#endif
private:
void CreateThreads(ExecutorThreadWorker* worker,
......@@ -48,10 +87,21 @@ class AsyncExecutor {
const std::vector<std::string>& fetch_var_names,
Scope* root_scope, const int thread_index,
const bool debug);
#ifdef PADDLE_WITH_PSLIB
void PrepareDenseThread(const std::string& mode);
#endif
public:
#ifdef PADDLE_WITH_PSLIB
std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
std::shared_ptr<DensePullThread> _pull_dense_thread;
AsyncWorkerParamConfig _param_config;
#endif
Scope* root_scope_;
platform::Place place_;
private:
int actual_thread_num;
};
} // namespace framework
......
......@@ -64,6 +64,7 @@ bool DataFeed::PickOneFile(std::string* filename) {
return false;
}
*filename = filelist_[file_idx_++];
LOG(ERROR) << "pick file:" << *filename;
return true;
}
......
......@@ -50,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
......@@ -63,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph)
cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
......@@ -84,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass)
fuse_elewise_add_act_pass multi_batch_merge_pass
memory_optimize_pass)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/analysis_var_pass.h"
#include <algorithm>
#include <atomic>
#include <deque>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <queue>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_bool(enable_subgraph_optimize, false,
"SubGraph also reuses global graph variables; it will reduce the "
"memory occupation, but with a higher risk of memory reuse error. "
"Disabled by default.");
DEFINE_string(memory_optimize_debug, "",
"debug the operator output variable when doing variable reuse in the "
"memory reuse pass. "
"Only for debugging; disabled by default.");
namespace paddle {
namespace framework {
namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
template <typename Container, typename Callback>
class FilterVariableImpl {
public:
void operator()(const Container& nodes, Callback callback) {
for (auto* node : nodes) {
callback(node);
}
}
};
// filter var node for op->inputs/outputs
template <typename Callback>
class FilterVariableImpl<std::vector<ir::Node*>, Callback> {
public:
void operator()(const std::vector<ir::Node*>& nodes, Callback callback) {
for (auto* var : nodes) {
if (var->IsVar() && !var->IsCtrlVar()) {
callback(var);
}
}
}
};
template <typename Container, typename Callback>
void FilterVariables(const Container& nodes, Callback callback) {
FilterVariableImpl<Container, Callback>()(nodes, callback);
}
std::unique_ptr<ir::Graph> AnalysisVarPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes();
auto subblock_vars = GetSubBlockVars(nodes);
skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
cfg_.reset(new details::ControlFlowGraph(*graph));
cfg_->LiveVariableAnalysis();
InitSSAGraphNodes();
int reuse_id = 0;
for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) {
auto& op = cfg_->Ops()[idx];
auto* op_desc = op->Op();
// some ops in the graph have no op desc
if (op_desc == nullptr) continue;
if (OpHasSubBlock(op_desc)) {
if (FLAGS_enable_subgraph_optimize) {
SubGraphOptimize(op_desc);
} else {
VLOG(3) << op->Name()
<< " has subblock, but disable subgraph optimize. skipped.";
continue;
}
}
for (auto& var : op->outputs) {
if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
ir::Node* cache = pool_.NodeMatch(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (cache != nullptr) {
if (var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused."
<< var->Name() << " is re-filled to the pool after"
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
continue;
}
int node_idx_in_pool = pool_.GetIndex(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size()));
// update CFG Graph on the fly.
// the reused var may be re-filled into the pool
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
// NOTE(dzhwinter): we need to both update the ProgramDesc
// and IR Graph. because op_desc/var_desc is used in CreateOp,
// CreateVar when running happens. But IR Graph
// define the dependence relationship between nodes.
RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
pool_.Erase(cache);
}
}
}
// fill the pool
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
if (var_node == nullptr) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node, op);
}
}
}
}
graph->ResolveHazard(var_nodes_);
// For the early delete pass: use GraphNodePool to load the unlived vars.
// 1. find all dep ops for each unlived var in the memory pool.
for (auto& op : graph->Nodes()) {
for (auto& var : op->inputs) {
if (pool_.Has(var)) {
pool_.Insert(var, op);
}
}
}
// 2. convert the ir node based memory pool to graph nodes,
// because a Node* may be released between passes.
auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
for (auto it = pool_.begin(); it != pool_.end(); ++it) {
std::unordered_set<OpDesc*> descs;
for (auto& op : it->second) {
PADDLE_ENFORCE(op->IsOp());
descs.insert(op->Op());
}
graph_pool.push_back(std::make_pair(it->first->Name(), descs));
}
return graph;
}
void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const {
// conditional block, while op and their grad op
auto* sub_block_desc =
AttrReader(op_desc->GetAttrMap()).Get<BlockDesc*>("sub_block");
// create a mirror block to construct an IR Graph.
ProgramDesc prog;
auto* copy_block = prog.MutableBlock(0);
for (auto* op : sub_block_desc->AllOps()) {
auto* copy_op = copy_block->AppendOp();
copy_op->CopyFrom(*op);
copy_op->Flush();
}
for (auto* var : sub_block_desc->AllVars()) {
auto* copy_var = copy_block->Var(var->Name());
copy_var->SetDataType(var->GetDataType());
// only lod tensor can be reused. So ignore the multiple dims case.
copy_var->SetType(var->GetType());
copy_var->SetShape(var->GetShape());
copy_var->SetPersistable(var->Persistable());
}
ir::Graph sub_graph(prog);
std::unordered_set<ir::Node*> sub_graph_all_ops;
FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) {
// sub_graph_all_ops.emplace(var);
if (var->IsVar() && !var->IsCtrlVar()) {
sub_graph_all_ops.emplace(var);
}
});
int sub_reuse_id = 0;
// subgraph nodes are unordered; reuse needs to follow the desc order.
// find the right op node through the descs
for (auto* sub_op_desc : sub_block_desc->AllOps()) {
ir::Node* sub_op = nullptr;
for (auto* node : sub_graph_all_ops) {
if (node->Op() == sub_op_desc) {
sub_op = node;
break;
}
}
PADDLE_ENFORCE(sub_op != nullptr);
for (auto* var : sub_op->outputs) {
if (NodeCanReused(var)) {
ir::Node* cache = pool_.NodeMatch(var);
if (cache != nullptr) {
if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
continue;
}
int node_idx_in_pool = pool_.GetIndex(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(sub_reuse_id++), DebugString(var),
DebugString(cache), node_idx_in_pool,
static_cast<int>(pool_.size()));
// NOTE(dzh): subblock is not in IR graph. Modify the block_desc
// immediately to make the subblock variable reuse strategy take
// effect. Because it is a single op in graph. No need to
// update the ir nodes.
sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block()->HasVar(var->Name())) {
sub_op_desc->Block()->RemoveVar(var->Name());
}
}
}
}
}
}
std::unordered_set<std::string> AnalysisVarPass::GetSubBlockVars(
const std::unordered_set<ir::Node*>& nodes) const {
std::unordered_set<std::string> vars;
for (auto& op : nodes) {
if (!op->IsOp() || op->Op() == nullptr) continue;
auto* op_desc = op->Op();
if (OpHasSubBlock(op_desc)) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
vars.insert(inputs.begin(), inputs.end());
vars.insert(outputs.begin(), outputs.end());
}
}
return vars;
}
void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var,
size_t idx) const {
for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
auto* op = cfg_->Ops()[i];
PADDLE_ENFORCE(op->IsOp() && op->Op());
auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
op_desc->Flush();
}
}
void AnalysisVarPass::InitSSAGraphNodes() const {
std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars;
if (var_nodes_.empty()) {
for (auto* op : cfg_->Ops()) {
for (auto* node : op->inputs) {
if (all_vars[node->Name()].count(node) == 0) {
all_vars[node->Name()].emplace(node);
var_nodes_[node->Name()].emplace_back(node);
}
}
for (auto* node : op->outputs) {
if (all_vars[node->Name()].count(node) == 0) {
all_vars[node->Name()].emplace(node);
var_nodes_[node->Name()].emplace_back(node);
}
}
}
}
}
void AnalysisVarPass::RenameVarInGraphNode(const std::string& var,
const std::string& cache_var,
size_t idx, ir::Graph* graph) const {
// if a replacement happens, we need to create a newer version of cache_var,
// but with the same dims/data_type as var.
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
var_nodes_[var].at(0)->Var() != nullptr);
std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
var_desc->SetName(cache_var);
for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
auto* op = cfg_->Ops()[i];
// redirect the input to the latest version of cache_var
for (auto* node : op->inputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
var_nodes_[cache_var].emplace_back(cache_node);
// swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(),
node->outputs.begin(), node->outputs.end());
PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp());
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
}
}
// if we need to rename the output,
// always create a newer version of cache_var
for (auto* node : op->outputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
var_nodes_[cache_var].emplace_back(cache_node);
// swap node to cache node
cache_node->outputs.insert(cache_node->outputs.end(),
node->outputs.begin(), node->outputs.end());
cache_node->inputs.emplace_back(op);
std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
}
}
}
// release node of unused var in graph
for (auto* node : var_nodes_[var]) {
graph->RemoveNode(node);
}
var_nodes_.at(var).clear();
}
bool AnalysisVarPass::NodeCanReused(ir::Node* node) const {
if (!node->IsVar() || node->IsCtrlVar()) return false;
auto* desc = node->Var();
auto type = desc->GetType();
if (desc->Persistable() || type != proto::VarType::LOD_TENSOR ||
desc->GetShape().empty()) {
return false;
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
std::string name = node->Name();
if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
return false;
if (skip_set_.count(name)) return false;
for (auto* op : node->inputs) {
if (op->Op()->HasAttr("force_cpu")) {
// op output is forced to be generated on cpu, so it can not be reused.
return framework::AttrReader(op->Op()->GetAttrMap())
.Get<bool>("force_cpu") == 0;
}
}
return true;
}
bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const {
const AttributeMap& attrs = desc->GetAttrMap();
for (auto& attr : attrs) {
if (attr.second.type() == typeid(BlockDesc*) || // NOLINT
attr.second.type() == typeid(std::vector<BlockDesc*>)) // NOLINT
return true;
}
return false;
}
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
std::deque<ir::Node*> ops;
FilterVariables(nodes, [&](ir::Node* op) {
if (op->IsOp() && op->Op() != nullptr) {
ops.emplace_back(op);
}
});
std::unordered_map<ir::Node*, size_t> op_deps;
std::list<ir::Node*> ready_ops;
std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
for (auto* op : ops) {
std::unordered_set<ir::Node*> preceding_op;
for (auto* in : op->inputs) {
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
preceding_op.emplace(in->inputs[0]);
pending_ops[in->inputs[0]].emplace(op);
}
op_deps[op] = preceding_op.size();
if (preceding_op.empty()) {
ready_ops.emplace_back(op);
}
}
// 3. generate the op list based on the desc order and the topology order
std::vector<ir::Node*> ret;
std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
auto update_by_found_node = [&](ir::Node* found_node) {
for (auto* pending_op : pending_ops[found_node]) {
if (--op_deps[pending_op] == 0) {
ready_ops.emplace_back(pending_op);
}
}
ready_ops.remove(found_node);
ret.emplace_back(found_node);
};
while (!ready_ops.empty()) {
bool all_of_ready_op_unmatched = true;
for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
auto op_desc = *it;
ir::Node* found_node = nullptr;
for (auto* op : ready_ops) {
if (IsSameDesc(op->Op(), op_desc)) {
found_node = op;
break;
}
}
// 3.1 op desc was deleted by another pass
if (found_node == nullptr) {
++it;
continue;
} else {
all_of_ready_op_unmatched = false;
it = op_descs_list.erase(it);
}
update_by_found_node(found_node);
}
// 3.2 op descs were added by another pass:
// a non-empty preceding op set means some new op descs were
// created but are not contained in the returned node list.
// these new op descs may depend on each other.
std::list<ir::Node*> prev_ready_ops(ready_ops);
if (all_of_ready_op_unmatched) {
for (auto op : prev_ready_ops) {
update_by_found_node(op);
}
}
}
PADDLE_ENFORCE(std::all_of(
op_deps.begin(), op_deps.end(),
[&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
return ret;
}
ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
ops_ = SortOpLikeDescOrder(graph);
ConnectNodes();
}
void ControlFlowGraph::BuildCFGGraph() {
// FIXME(dzh): same effect as ConnectNodes, but uses the control
// link to build the dependency graph; it goes wrong in transformer.
for (ir::Node* op : ops_) {
for (auto& input_var : op->inputs) {
if (!input_var->inputs.empty()) {
PADDLE_ENFORCE(
input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
auto* pred_op = input_var->inputs[0];
if (pred_op->Op() != nullptr) {
predecessors_[op].insert(pred_op);
successors_[pred_op].insert(op);
}
}
if (input_var->IsVar() && !input_var->IsCtrlVar()) {
uses_[op].insert(input_var->Name());
}
}
for (auto& output_var : op->outputs) {
// an output var may be used by many ops
for (auto* succ_op : output_var->outputs) {
if (succ_op->Op() != nullptr) {
successors_[op].insert(succ_op);
predecessors_[succ_op].insert(op);
}
}
if (output_var->IsVar() && !output_var->IsCtrlVar()) {
defs_[op].insert(output_var->Name());
}
}
}
}
void ControlFlowGraph::ConnectNodes() {
for (size_t i = 0; i < ops_.size(); ++i) {
auto& op = ops_[i];
try {
auto& next_op = ops_.at(i + 1);
successors_[op].insert(next_op);
predecessors_[next_op].insert(op);
} catch (...) {
// do nothing
}
FilterVariables(op->inputs,
[&](ir::Node* var) { uses_[op].emplace(var->Name()); });
FilterVariables(op->outputs,
[&](ir::Node* var) { defs_[op].emplace(var->Name()); });
}
}
void ControlFlowGraph::LiveVariableAnalysis() {
// NOTE(dzh): variable liveness analysis (a.k.a. the reversed_ops algorithm).
// It computes the liveness of each variable by iterating the operators from
// end to begin, computing the live-in/live-out variable set for each op;
// the diff between in/out is then used for variable reuse. For details refer
// to http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
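// In dataflow terms, iterating the ops in reverse until a fixed point:
//   live_out(op) = union of live_in(s) over all successors s of op
//   live_in(op)  = use(op) U (live_out(op) - def(op))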
std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
while (!work_list.empty()) {
ir::Node* op = work_list.front();
work_list.pop_front();
    // get the previously calculated live_in; empty on the first visit.
auto prev_live_in = std::move(live_in_[op]);
for (auto& s : successors_[op]) {
for (auto& var : live_in_[s]) {
live_out_[op].insert(var);
}
}
for (auto& var : uses_[op]) {
live_in_[op].insert(var);
}
for (auto& var : live_out_[op]) {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
live_in_[op].erase(var);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if (live_in_[op] != prev_live_in) {
for (auto& pre : predecessors_[op]) {
work_list.push_back(pre);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
if (uses_[op].find(old_node) != uses_[op].end()) {
uses_[op].erase(old_node);
uses_[op].insert(new_node);
}
if (defs_[op].find(old_node) != defs_[op].end()) {
defs_[op].erase(old_node);
defs_[op].insert(new_node);
}
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
ir::Node* op) const {
  // In an SSA graph, nodes of different versions share the same name.
  // This function gets the latest-version var defined before the target op.
  // It may return nullptr, e.g. for a data node.
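  // Example (a sketch): for [b = op1(a); b = op2(b)], both "b" var nodes
  // share the name "b"; GetNodeFromVarName("b", op2) returns op1's output
  // node, i.e. the version of "b" defined most recently before op2.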
ir::Node* found_node = nullptr;
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
if (output->Name() == name) {
found_node = output;
}
}
}
return found_node;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// sort ops in BFS order
std::vector<ir::Node*> BFSSortGraphOps(const ir::Graph& graph);
class ControlFlowGraph;
class AnalysisVarPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private:
  // fill the variable map (var_nodes_) by version.
void InitSSAGraphNodes() const;
// update program descs
void RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var, size_t idx) const;
// update ir nodes
void RenameVarInGraphNode(const std::string& var,
const std::string& cache_var, size_t idx,
ir::Graph* graph) const;
void SubGraphOptimize(OpDesc* op_desc) const;
  // check whether a tensor can be reused
bool NodeCanReused(ir::Node* node) const;
// scan subblock and collect the output/input variables.
std::unordered_set<std::string> GetSubBlockVars(
const std::unordered_set<ir::Node*>&) const;
  // check whether the op has a sub-block
bool OpHasSubBlock(OpDesc* desc) const;
private:
// Reuse Node Pool, Owned.
mutable OrderedNodePairPool pool_;
  // control flow graph
mutable std::unique_ptr<ControlFlowGraph> cfg_;
// skip set
mutable std::unordered_set<std::string> skip_set_;
// var nodes
mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// For IR Graph in parallelexecutor
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
  // successor ops that use the output variables.
  NodeListMap successors_;
  // predecessor ops that generate the input variables.
  NodeListMap predecessors_;
  // variables live before running the current op.
  VarSetMap live_in_;
  // variables live after running the current op.
  VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
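// Example usage (a minimal sketch; assumes `graph` already carries the
// kAllOpDescs attribute, as the accompanying tests set up):
//
//   ControlFlowGraph cfg(graph);
//   cfg.LiveVariableAnalysis();
//   for (ir::Node* op : cfg.Ops()) {
//     VLOG(3) << op->Name() << " live_in=" << cfg.LiveIn(op).size()
//             << " live_out=" << cfg.LiveOut(op).size();
//   }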
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/analysis_var_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
class DummyOp : public OperatorBase {
public:
DummyOp(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const Scope& scope,
const platform::Place& place) const override {}
};
class SumOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class AssignOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class DummyVarTypeInference : public VarTypeInference {
public:
void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
auto& inputs = op_desc.Input("X");
auto type = block->Var(inputs.front())->GetType();
auto out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetType(type);
}
};
} // namespace framework
} // namespace paddle
REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
paddle::framework::AssignOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customized classical dependency graph; the left column is the
instruction number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analyze these variables' liveness ranges.
*/
namespace paddle {
namespace framework {
namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"c"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"d"});
op->SetOutput("Out", {"e"});
}
return prog;
}
template <typename Container>
inline static std::string DebugString(const Container& c) {
std::stringstream ss;
for (auto& item : c) {
ss << item << " ";
}
return ss.str();
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
// test assign op
ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
// test sum op
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
}
// 1. normal test
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->outputs.back()->Name() == "e") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
  d->outputs.erase(
      std::remove(d->outputs.begin(), d->outputs.end(), found_node),
      d->outputs.end());
graph.RemoveNode(found_node);
graph.RemoveNode(e);
// other node keeps the same order
auto remain_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < remain_nodes.size(); ++i) {
auto node = remain_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
  // the cached descs differ from the real ones:
  // mimic an intermediate pass modifying the ProgramDesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
op_descs.insert(op_descs.begin() + 4, op);
auto nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
if (node->Name() == "sum") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* c = find_node_in_graph("c");
ir::Node* e = find_node_in_graph("e");
    d->outputs.erase(
        std::remove(d->outputs.begin(), d->outputs.end(), found_node),
        d->outputs.end());
    c->outputs.erase(
        std::remove(c->outputs.begin(), c->outputs.end(), found_node),
        c->outputs.end());
ir::Node* pending_op = found_node->outputs[0]->outputs[0];
graph.RemoveNode(e);
graph.RemoveNode(pending_op);
graph.RemoveNode(found_node);
}
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.insert(op_descs.begin() + 2, op);
// check the order
auto mynodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < mynodes.size(); ++i) {
auto node = mynodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.emplace_back(op);
// replace op_desc inplace
auto nodes = graph.Nodes();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->Op() && node->Name() == "assign") {
if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
found_node = node;
break;
}
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* e = find_node_in_graph("e");
    d->outputs.erase(
        std::remove(d->outputs.begin(), d->outputs.end(), found_node),
        d->outputs.end());
    e->inputs.erase(
        std::remove(e->inputs.begin(), e->inputs.end(), found_node),
        e->inputs.end());
graph.RemoveNode(found_node);
}
op_descs.erase(op_descs.begin() + 3);
auto replace_op = prog.MutableBlock(0)->AppendOp();
replace_op->SetType("sum");
replace_op->SetInput("X", {"d", "d1"});
replace_op->SetOutput("Out", {"e"});
{
ir::Node* sum2 = graph.CreateOpNode(replace_op);
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
ir::Node* d1 = find_node_in_graph("d1");
sum2->inputs.emplace_back(d);
sum2->inputs.emplace_back(d1);
sum2->outputs.emplace_back(e);
e->inputs.emplace_back(sum2);
d->outputs.emplace_back(sum2);
d1->outputs.emplace_back(sum2);
}
op_descs.emplace_back(replace_op);
// compare op order
auto graph_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < graph_nodes.size(); ++i) {
auto node = graph_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -14,11 +14,16 @@ limitations under the License. */
#include "paddle/fluid/framework/details/build_strategy.h"
#include <glog/logging.h>
#include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
......@@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
}
VLOG(1) << "CollectiveContext:" << context->String();
    // NOTE(dzh): memory optimization should be a runtime pass.
    // However, after multi_devices_pass, VarHandle/OpHandle become the
    // de-facto IR, and any reuse on the Graph is meaningless.
    // As a side effect, the memory optimizer cannot foresee the fetched
    // vars, so the fetch list should be set persistable before calling the
    // Run interface.
    if (strategy.memory_optimize_) {
      AppendPass("analysis_var_pass");
    }
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
......@@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy_.debug_graphviz_path_);
const std::string graph_path =
string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(),
"_multi_devices_graph");
multi_devices_print_pass->Set<std::string>(kGraphvizPath,
new std::string(graph_path));
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
}
......@@ -127,7 +143,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
CreatePassesFromStrategy(false);
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") {
pass->Erase("places");
......@@ -145,6 +160,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "analysis_var_pass") {
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership
graph->Set<GraphNodePool>(kGraphNodePool,
new GraphNodePool); // take ownership
pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
} else if (pass->Type() == "sequential_execution_pass") {
LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
......@@ -166,6 +192,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(analysis_var_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
......@@ -60,8 +60,15 @@ struct BuildStrategy {
kCustomized = 2,
};
enum class OptimizeStrategy {
    // To be implemented: brute force, recursively compute unused var names.
    kBruteForce = 0,
    kControlFlowGraph = 1,  // use the CFG algorithm; faster.
};
ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph};
std::string debug_graphviz_path_{""};
......@@ -69,6 +76,10 @@ struct BuildStrategy {
bool enable_data_balance_{false};
bool memory_optimize_{false};
bool memory_early_delete_{false};
bool enable_sequential_execution_{false};
bool fuse_broadcast_op_{false};
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
namespace details {
class EarlyDeleteOpHandle : public OpHandleBase {
public:
EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
const platform::Place& place,
const std::vector<std::string>& names,
GarbageCollector* gc)
: OpHandleBase(node),
scope_(scope),
place_(place),
names_(names),
gc_(gc) {
#ifdef PADDLE_WITH_CUDA
    if (IsStreamGarbageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(place);
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
}
#endif
}
~EarlyDeleteOpHandle() {
#ifdef PADDLE_WITH_CUDA
    if (IsStreamGarbageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventDestroy(event_));
}
#endif
}
std::string Name() const override { return "early_delete"; }
protected:
void RunImpl() override {
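    // Ownership of each variable's allocation is moved out of the tensor
    // via MoveMemoryHolder(); the tensors stay alive with empty storage
    // while the collected allocations are handed to the garbage collector.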
std::vector<std::shared_ptr<memory::Allocation>> tensors;
auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
for (auto& var_name : names_) {
auto* var = local_scope->FindVar(var_name);
PADDLE_ENFORCE(var != nullptr,
string::Sprintf("Local Scope not has var %s", var_name));
if (var->IsType<LoDTensor>()) {
tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
tensors.emplace_back(var->GetMutable<SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>();
for (auto& tensor : *tensor_array) {
tensors.emplace_back(tensor.MoveMemoryHolder());
}
}
}
if (!tensors.empty()) {
ClearTensors(tensors);
}
}
private:
void ClearTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
if (platform::is_cpu_place(place_)) {
ClearCPUTensors(tensors);
} else {
ClearGPUTensors(tensors);
}
}
void ClearCPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
if (gc != nullptr) {
gc->Add(tensors);
}
}
  void ClearGPUTensors(
      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
#ifdef PADDLE_WITH_CUDA
    auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
    if (gc != nullptr) {
      auto compute_stream = dev_ctx_->stream();
      auto callback_stream = gc->stream();
      auto callback_func = [=]() {
        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
      };
      gc_->Add(tensors, callback_func);
    } else {
      gc_->Add(tensors);
    }
#endif
  }
  bool IsStreamGarbageCollector() const {
#ifdef PADDLE_WITH_CUDA
    return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
#else
    return false;
#endif
  }
const Scope* scope_;
const platform::Place place_;
std::vector<std::string> names_;
GarbageCollector* gc_;
#ifdef PADDLE_WITH_CUDA
platform::CUDADeviceContext* dev_ctx_;
cudaEvent_t event_;
#endif
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
#include <queue>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
std::queue<VarHandleBase*> queue;
queue.push(var_in);
do {
auto* var = queue.front();
queue.pop();
for (auto* op : var->PendingOps()) {
auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) {
return compute_op;
}
for (auto* out_var : op->Outputs()) {
queue.push(out_var);
}
}
} while (!queue.empty());
return nullptr;
}
std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);
std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
unlived_vars.reserve(graph_pool.size());
for (auto& pair : graph_pool) {
unlived_vars.insert(std::make_pair(pair.first, pair.second));
}
auto compare_and_insert_early_delete_op = [&](
OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
if (unlived_vars.empty()) return;
    // unlived vars can be deleted after the last op that uses them finishes.
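    // Each unlived_vars entry maps a var name to the ops that still use it;
    // ops are erased as they are visited, so the set becomes empty exactly
    // at the var's last use, which is where the early-delete handle goes.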
auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
    for (auto& var : vars) {
      auto* var_handle = dynamic_cast<VarHandle*>(var);
      // check the handle before dereferencing it: dummy/control-dep vars
      // are not VarHandles and carry no place.
      if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
          var_handle->Node()->IsCtrlVar())
        continue;
      auto var_name = var->Node()->Name();
      auto& var_place = var_handle->place_;
      if (unlived_vars.count(var_name) == 0) continue;
      if (!unlived_vars[var_name].empty()) {
        if (compute_op != nullptr &&
            unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) {
          unlived_vars[var_name].erase(compute_op->Node()->Op());
        }
        continue;
      }
      // shamelessly copied from the reference count pass.
      if (compute_op == nullptr) {
        // use the next computation op's scope
compute_op = FindNextComputationOpHandle(var_handle);
}
auto* early_delete_node =
graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get();
auto* early_delete_handle = new EarlyDeleteOpHandle(
early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc);
if (compute_op->Outputs().empty()) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
compute_op->AddOutput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
}
early_delete_handle->AddInput(compute_op->Outputs().front());
VLOG(5) << "Add early delete op " << var_name << " to Operator"
<< compute_op->Name();
}
};
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
for (auto& op : all_ops) {
compare_and_insert_early_delete_op(op, op->Inputs());
compare_and_insert_early_delete_op(op, op->Outputs());
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(memory_early_delete_pass,
paddle::framework::details::MemoryEarlyDeletePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kGarbageCollector);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/early_delete_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
class MemoryEarlyDeletePass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include <iostream>
#include <sstream>
#include <string>
namespace paddle {
namespace framework {
namespace details {
size_t NodeSizeInBytes(ir::Node* n) {
auto* desc = FindVarDescInBlock(n);
auto shape = desc->GetShape();
size_t type_size = SizeOfType(desc->GetDataType());
int size = 1;
for (auto& s : shape) {
size *= s;
}
return type_size * std::abs(size);
}
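// Example (a sketch): an FP32 variable with shape [-1, 10] yields
// |(-1) * 10| * sizeof(float) = 40 bytes; std::abs() folds the unknown
// batch dimension (-1) into a positive per-instance size.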
std::string DebugStringImpl(VarDesc* var) {
std::stringstream ss;
ss << var->Name();
ss << "[";
try {
auto shape = var->GetShape();
for (size_t i = 0; i < shape.size(); ++i) {
if (i != shape.size() - 1) {
ss << shape[i] << ",";
} else {
ss << shape[i];
}
}
ss << "]";
} catch (...) {
ss << "Var has no VarDesc !!! Name:" << var->Name();
}
return ss.str();
}
std::string DebugString(ir::Node* var) {
return DebugStringImpl(FindVarDescInBlock(var));
}
// NOTE(dzh): based on the ir::Node. If a large node has been reused by a
// small-size node, then the next time it appears in the pool it will have
// the small size, so we find the original node shape from the BlockDesc.
VarDesc* FindVarDescInBlock(ir::Node* n) {
PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
BlockDesc* block = n->inputs[0]->Op()->Block();
  PADDLE_ENFORCE(block->HasVar(n->Name()),
                 string::Sprintf("Block does not have var %s", n->Name()));
return block->FindVar(n->Name());
}
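// Example (a sketch): if a [-1, 1024] node was reused to hold a [-1, 10]
// tensor, the ir::Node now reports the smaller shape; the BlockDesc still
// records the original [-1, 1024], which is what the pool must compare.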
struct NodeComparator {
bool operator()(ir::Node* lhs, ir::Node* rhs) const {
auto* lhs_desc = FindVarDescInBlock(lhs);
auto* rhs_desc = FindVarDescInBlock(rhs);
auto lhs_shape = lhs_desc->GetShape();
auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
} else {
return false;
}
}
};
void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) {
PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
PADDLE_ENFORCE(op->IsOp());
if (mark_table_.count(var->Name()) != 0) {
mark_table_[var->Name()]->second.insert(op);
return;
}
auto* var_desc = FindVarDescInBlock(var);
auto var_shape = var_desc->GetShape();
int batch_size = static_cast<int>(var_shape[0]);
NodeComparator compare_node;
Iter it = nodes_.begin();
while (it != nodes_.end()) {
auto* cache_desc = FindVarDescInBlock(it->first);
int cache_batch_size = cache_desc->GetShape()[0];
if ((cache_batch_size == -1 && batch_size == -1) ||
(cache_batch_size != -1 && batch_size != -1)) {
if (compare_node(it->first, var)) {
++it;
} else {
break;
}
} else if (cache_batch_size == -1 && batch_size != -1) {
++it;
} else if (cache_batch_size != -1 && batch_size == -1) {
break;
}
}
it =
nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
mark_table_[var->Name()] = it;
}
int OrderedNodePairPool::GetIndex(ir::Node* var) {
return std::distance(nodes_.begin(), mark_table_[var->Name()]);
}
ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const {
ir::Node* found_node = nullptr;
NodeComparator compare_node;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
if (compare_node(var, it->first)) {
found_node = it->first;
break;
}
}
return found_node;
}
void OrderedNodePairPool::Erase(ir::Node* var) {
PADDLE_ENFORCE(mark_table_.count(var->Name()));
nodes_.erase(mark_table_[var->Name()]);
mark_table_.erase(var->Name());
}
std::string OrderedNodePairPool::ToString() const {
std::stringstream ss;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
ss << DebugString(it->first) << " ";
}
return ss.str();
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <iterator>
#include <list>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
constexpr char kFetchedVars[] = "fetched_vars";
constexpr char kGraphNodePool[] = "graph_node_pool";
// NOTE(dzh): maps each variable to the operators that use it,
// for the early delete pass.
// Because the analysis var pass is built on ir::Node, which may be released
// or modified between passes, we use OpDesc* to mark the ops.
using GraphNodePool = std::vector<
std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
// NOTE(dzh): by default, nodes are sorted in ascending order of byte size.
// In fluid, -1 means the batch_size is determined at runtime; nodes with
// batch_size == -1 always rank in front of nodes with a fixed batch size.
// For example,
// node0[-1, 1], node1[-1, 1, 1], node2[1, 1], node3[1, 1024], ...
// O(1) insert, delete
class OrderedNodePairPool {
public:
using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
using Iter = typename std::list<NodePair>::iterator;
using ConstIter = typename std::list<NodePair>::const_iterator;
void Insert(ir::Node* var, ir::Node* op);
void Erase(ir::Node* var);
bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
ir::Node* NodeMatch(ir::Node* var) const;
  // the map stores non-const iterators, so constness cannot be promised
  int GetIndex(ir::Node* var);
  // dump all nodes in the pool to a string
  std::string ToString() const;
Iter begin() { return nodes_.begin(); }
Iter end() { return nodes_.end(); }
ConstIter begin() const { return nodes_.begin(); }
ConstIter end() const { return nodes_.end(); }
size_t size() const { return nodes_.size(); }
private:
// for searching.
std::unordered_map<std::string, Iter> mark_table_;
  // node swap pairs: var -> the ops that depend on the var
std::list<NodePair> nodes_;
};
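// Example ordering (a sketch with hypothetical shapes): after inserting
// nodes shaped [-1, 1], [1, 1024], [-1, 2, 5], [1, 1], the pool iterates as
//   [-1, 1] -> [-1, 2, 5] -> [1, 1] -> [1, 1024]
// i.e. batch_size == -1 nodes rank first, and each group is sorted by
// ascending byte size.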
// node memory size in bytes
size_t NodeSizeInBytes(ir::Node* n);
std::string DebugString(ir::Node* var);
VarDesc* FindVarDescInBlock(ir::Node* n);
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
namespace details {
TEST(OrderedNodePairPool, Normal) {
OrderedNodePairPool pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
// clang-format off
std::vector<std::vector<int64_t>> shapes = {{-1, 10},
{-1, 20},
{1, 2},
{5, 2},
{10, 20},
{-1, 2, 5},
{-1, 1, 5},
{-1, 1}};
// clang-format on
const int COUNT = shapes.size();
ProgramDesc prog;
BlockDesc* block_desc = prog.MutableBlock(0);
auto* op_desc = block_desc->AppendOp();
op_desc->SetType("dummy");
std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
for (int i = 0; i < COUNT; ++i) {
auto desc = block_desc->Var(std::to_string(i));
desc->SetShape(shapes[i]);
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
for (auto& node : nodes) {
pool.Insert(node.get(), op.get());
}
// assert its order and interface.
std::cout << pool.ToString() << std::endl;
pool.Erase(nodes.front().get());
std::cout << pool.ToString() << std::endl;
ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
{
auto v1 = block_desc->Var("11");
v1->SetShape({-1, 256, 56, 56});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
ASSERT_EQ(cache, nullptr);
}
{
auto v2 = block_desc->Var("12");
v2->SetShape({-1, 2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5]
}
{
auto v3 = block_desc->Var("13");
v3->SetShape({2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2]
}
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
} // namespace paddle
REGISTER_PASS(multi_devices_print_pass,
paddle::framework::details::SSAGraghBuilderWithPrinter);
paddle::framework::details::SSAGraghBuilderWithPrinter)
.RequirePassAttr(paddle::framework::details::kGraphvizPath);
......@@ -14,6 +14,7 @@
#pragma once
#include <glog/logging.h>
#include <fstream>
#include <iosfwd>
#include <ostream>
......@@ -24,6 +25,8 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kGraphvizPath[] = "debug_graphviz_path";
class SSAGraphPrinter {
public:
virtual ~SSAGraphPrinter() {}
......@@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass {
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
std::unique_ptr<std::ostream> fout(
new std::ofstream(Get<const std::string>("debug_graphviz_path")));
new std::ofstream(Get<std::string>(kGraphvizPath)));
PADDLE_ENFORCE(fout->good());
Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
return graph;
......
......@@ -25,7 +25,7 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@";
// Wraps ir::Node and provide helper utilities.
// It's responsible for populating necessary fields of ir::Node.
......
......@@ -100,7 +100,7 @@ static void DeleteUnusedTensors(
continue;
}
auto* var = scope.FindVar(name);
if (var != nullptr) {
if (var == nullptr) {
continue;
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/executor_thread_worker.h"
#include <algorithm>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
......@@ -32,6 +33,89 @@ limitations under the License. */
namespace paddle {
namespace framework {
#ifdef PADDLE_WITH_PSLIB
int DensePullThread::start() {
_running = true;
_t = std::thread(&DensePullThread::run, this);
return 0;
}
void DensePullThread::run() {
while (_running) {
_pull_dense_status.resize(0);
for (auto& t : _dense_variable_name) {
if (check_update_param(t.first)) {
auto status = pull_dense(t.first);
_pull_dense_status.emplace_back(std::move(status));
reset_thread_version(t.first);
}
}
if (_pull_dense_status.size() != 0) {
wait_all();
}
usleep(_sleep_time_ms * 1000);
}
}
bool DensePullThread::check_update_param(uint64_t table_id) {
{
std::lock_guard<std::mutex> lock(_mutex_for_version);
auto& version = _training_versions[table_id];
_current_version[table_id] =
*(std::min_element(version.begin(), version.end()));
}
if (_current_version[table_id] - _last_versions[table_id] < _threshold) {
return false;
}
return true;
}
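// Example (a sketch): with _threshold == 3, per-thread training versions
// {5, 7, 6} give a current version of min == 5; if the last pulled version
// was 2, then 5 - 2 >= 3 holds and run() issues a fresh pull_dense().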
void DensePullThread::reset_thread_version(uint64_t table_id) {
std::lock_guard<std::mutex> lock(_mutex_for_version);
_last_versions[table_id] = _current_version[table_id];
}
std::future<int32_t> DensePullThread::pull_dense(uint64_t table_id) {
auto& regions = _regions[table_id];
regions.clear();
auto& variables = _dense_variable_name[table_id];
regions.resize(variables.size());
for (auto i = 0u; i < variables.size(); ++i) {
auto& t = variables[i];
Variable* var = _root_scope->FindVar(t);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
paddle::ps::Region reg(w, tensor->numel());
regions[i] = std::move(reg);
}
return _ps_client->pull_dense(regions.data(), regions.size(), table_id);
}
void DensePullThread::wait_all() {
for (auto& t : _pull_dense_status) {
t.wait();
auto status = t.get();
if (status != 0) {
LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times;
}
}
if (_pull_dense_fail_times > 20) {
LOG(FATAL) << "pull dense failed times more than 20 times";
exit(-1);
}
_pull_dense_status.resize(0);
}
void DensePullThread::increase_thread_version(int thread_id,
uint64_t table_id) {
std::lock_guard<std::mutex> lock(_mutex_for_version);
_training_versions[table_id][thread_id]++;
}
#endif
void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
auto& block = program.Block(0);
op_names_.clear();
......@@ -202,5 +286,358 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
root_scope_ = g_scope;
}
#ifdef PADDLE_WITH_PSLIB
// AsyncExecutor
void AsyncExecutorThreadWorker::TrainFiles() {
SetDevice();
int fetch_var_num = fetch_var_names_.size();
fetch_values_.clear();
fetch_values_.resize(fetch_var_num);
thread_reader_->Start();
int cur_batch;
int batch_cnt = 0;
while ((cur_batch = thread_reader_->Next()) > 0) {
// executor run here
TrainOneNetwork();
++batch_cnt;
thread_scope_->DropKids();
if (debug_ == false || thread_id_ != 0) {
continue;
}
for (int i = 0; i < fetch_var_num; ++i) {
print_fetch_var(thread_scope_, fetch_var_names_[i]);
} // end for (int i = 0...)
} // end while ()
}
void AsyncExecutorThreadWorker::SetPSlibPtr(
std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
_pslib_ptr = pslib_ptr;
}
void AsyncExecutorThreadWorker::SetPullDenseThread(
std::shared_ptr<DensePullThread> dpt) {
_pull_dense_thread = dpt;
}
void AsyncExecutorThreadWorker::TrainOneNetwork() {
PrepareParams();
for (auto& op : ops_) {
if (op->Type().find("sgd") != std::string::npos) {
continue;
}
bool need_skip = false;
for (auto t = 0u; t < _param_config->skip_op.size(); ++t) {
if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) {
need_skip = true;
break;
}
}
if (!need_skip) {
op->Run(*thread_scope_, place_);
}
}
UpdateParams();
}
void AsyncExecutorThreadWorker::SetParamConfig(
AsyncWorkerParamConfig* param_config) {
_param_config = param_config;
}
void AsyncExecutorThreadWorker::PrepareParams() {
for (auto table_id : _param_config->sparse_table_id) {
PullSparse(table_id);
for (auto& t : _pull_sparse_status) {
t.wait();
auto status = t.get();
if (status != 0) {
LOG(ERROR) << "pull sparse failed, status[" << status << "]";
exit(-1);
}
}
}
_pull_sparse_status.resize(0);
for (auto table_id : _param_config->sparse_table_id) {
FillSparse(table_id);
}
}
void AsyncExecutorThreadWorker::UpdateParams() {
for (auto i : _param_config->sparse_table_id) {
PushSparse(i);
}
for (auto i : _param_config->dense_table_id) {
PushDense(i);
}
int32_t tmp_push_dense_wait_times = -1;
int32_t tmp_push_sparse_wait_times = -1;
static uint32_t push_dense_wait_times =
static_cast<uint32_t>(tmp_push_dense_wait_times);
static uint32_t push_sparse_wait_times =
static_cast<uint32_t>(tmp_push_sparse_wait_times);
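  // Note: a wait_times of -1 wraps to UINT32_MAX after the cast, so the
  // size checks below effectively never trigger; instead the "== -1"
  // branches clear the status vectors without waiting (fire-and-forget).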
if (_push_dense_status.size() >= push_dense_wait_times) {
for (auto& t : _push_dense_status) {
t.wait();
}
_push_dense_status.resize(0);
}
if (tmp_push_dense_wait_times == -1) {
_push_dense_status.resize(0);
}
if (_push_sparse_status.size() >= push_sparse_wait_times) {
for (auto& t : _push_sparse_status) {
t.wait();
}
_push_sparse_status.resize(0);
}
if (tmp_push_sparse_wait_times == -1) {
_push_sparse_status.resize(0);
}
for (auto dense_table_id : _param_config->dense_table_id) {
_pull_dense_thread->increase_thread_version(thread_id_, dense_table_id);
}
}
void AsyncExecutorThreadWorker::PushDense(int table_id) {
std::vector<paddle::ps::Region> regions;
for (auto& t : _param_config->dense_gradient_variable_name[table_id]) {
Variable* var = thread_scope_->FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int count = tensor->numel();
float* g = tensor->data<float>();
paddle::ps::Region reg(g, count);
regions.emplace_back(std::move(reg));
}
auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(),
regions.size(), table_id);
_push_dense_status.push_back(std::move(status));
}
void AsyncExecutorThreadWorker::PullSparse(int table_id) {
auto& features = _features[table_id];
auto& feature_value = _feature_value[table_id];
auto fea_dim = _param_config->fea_dim;
// slot id starts from 1
features.clear();
features.resize(0);
features.reserve(MAX_FEASIGN_NUM);
const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
// slot_idx = 0 is label TODO
for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int64_t* ids = tensor->data<int64_t>();
int len = tensor->numel();
for (auto i = 0u; i < len; ++i) {
      // TODO(colourful-tree): current trick - filter feasign == use_slot_mod
      // (bug: datafeed fills use_slot_mod for empty slots)
if (ids[i] == 0u) {
continue;
}
features.push_back(static_cast<uint64_t>(ids[i]));
}
}
check_pull_push_memory(features, &feature_value, fea_dim);
std::vector<float*> pull_feature_value;
for (auto i = 0u; i < features.size(); ++i) {
pull_feature_value.push_back(feature_value[i].data());
}
auto status = _pslib_ptr->_worker_ptr->pull_sparse(
pull_feature_value.data(), table_id, features.data(), features.size());
_pull_sparse_status.push_back(std::move(status));
auto& push_g = _feature_push_value[table_id];
check_pull_push_memory(features, &push_g, fea_dim);
collect_feasign_info(table_id);
}
void AsyncExecutorThreadWorker::FillSparse(int table_id) {
auto slot_dim = _param_config->slot_dim;
auto fea_dim = _param_config->fea_dim;
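  // Assumption (inferred from the "+ 2" offsets below): each pulled feature
  // value holds fea_dim floats laid out as two metadata floats followed by
  // the slot_dim-float embedding, i.e. fea_dim >= slot_dim + 2.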
auto& features = _features[table_id];
auto& fea_value = _feature_value[table_id];
CHECK(features.size() > 0) << "feature size check failed";
auto fea_idx = 0u;
std::vector<float> init_value(fea_dim);
const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
// slot_idx = 0 is label TODO
for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int64_t* ids = tensor->data<int64_t>();
int len = tensor->numel();
Variable* var_emb = thread_scope_->FindVar(
_param_config->slot_input_vec[table_id][slot_idx - 1]);
LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
float* ptr =
tensor_emb->mutable_data<float>({len, slot_dim}, platform::CPUPlace());
memset(ptr, 0, sizeof(float) * len * slot_dim);
auto& tensor_lod = tensor->lod()[0];
LoD data_lod{tensor_lod};
tensor_emb->set_lod(data_lod);
for (auto index = 0u; index < len; ++index) {
if (ids[index] == 0u) {
memcpy(ptr + slot_dim * index, init_value.data() + 2,
sizeof(float) * slot_dim);
continue;
}
memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2,
sizeof(float) * slot_dim);
fea_idx++;
}
}
}
void AsyncExecutorThreadWorker::PushSparse(int table_id) {
auto slot_dim = _param_config->slot_dim;
auto fea_dim = _param_config->fea_dim;
auto& features = _features[table_id];
auto& push_g = _feature_push_value[table_id];
check_pull_push_memory(features, &push_g, fea_dim);
CHECK(push_g.size() == features.size() + 1)
<< "push_g size:" << push_g.size()
<< " features size:" << features.size();
uint64_t fea_idx = 0u;
auto& fea_info = _fea_info[table_id];
int offset = 2;
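  // Layout note: the first two floats of each pushed gradient are metadata
  // (push_g[i][0] is set to 1.0 as an instance counter, push_g[i][1] holds
  // the label); the slot_dim gradient values start at `offset` (2).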
const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
// slot_idx = 0 is label
for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) ==
_param_config->slot_alias_to_table.end()) {
LOG(ERROR) << "ERROR slot_idx:" << slot_idx
<< " name:" << feed_vec[slot_idx];
} else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] !=
table_id) {
continue;
}
Variable* g_var = thread_scope_->FindVar(
_param_config->gradient_var[table_id][slot_idx - 1]);
CHECK(g_var != nullptr)
<< "var[" << _param_config->gradient_var[table_id][slot_idx - 1]
<< "] not found";
LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
if (g_tensor == NULL) {
LOG(ERROR) << "var["
<< _param_config->gradient_var[table_id][slot_idx - 1]
<< "] not found";
exit(-1);
}
float* g = g_tensor->data<float>();
Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (tensor == NULL) {
LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found";
exit(-1);
}
int len = tensor->numel();
CHECK(slot_dim * len == g_tensor->numel())
<< "len:" << len << " g_numel:" << g_tensor->numel();
CHECK(len == tensor->numel()) << "len:" << len
<< "t_numel:" << tensor->numel();
int64_t* ids = tensor->data<int64_t>();
for (auto id_idx = 0u; id_idx < len; ++id_idx) {
if (ids[id_idx] == 0) {
g += slot_dim;
continue;
}
memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim);
push_g[fea_idx][0] = 1.0f;
CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx
<< " size:" << fea_info.size();
push_g[fea_idx][1] = static_cast<float>(fea_info[fea_idx].label);
g += slot_dim;
fea_idx++;
}
}
CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx
<< " features size:" << features.size();
CHECK_GT(features.size(), 0);
std::vector<float*> push_g_vec;
for (auto i = 0u; i < features.size(); ++i) {
push_g_vec.push_back(push_g[i].data());
}
auto status = _pslib_ptr->_worker_ptr->push_sparse(
table_id, features.data(), (const float**)push_g_vec.data(),
features.size());
_push_sparse_status.push_back(std::move(status));
}
void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) {
auto& fea_info = _fea_info[table_id];
auto& feature = _features[table_id];
fea_info.resize(feature.size());
const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
Variable* var = thread_scope_->FindVar(feed_vec[0]);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int64_t* label = tensor->data<int64_t>();
int global_index = 0;
for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int64_t* ids = tensor->data<int64_t>();
int fea_idx = 0;
for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) {
for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) {
if (ids[fea_idx] == 0u) {
continue;
}
FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]};
fea_info[global_index++] = std::move(info);
}
}
}
CHECK(global_index == feature.size())
<< "expect fea info size:" << feature.size() << " real:" << global_index;
}
void AsyncExecutorThreadWorker::check_pull_push_memory(
const std::vector<uint64_t>& features,
std::vector<std::vector<float>>* push_g, int dim) {
push_g->resize(features.size() + 1);
for (auto& t : *push_g) {
t.resize(dim);
}
}
void AsyncExecutorThreadWorker::check_pull_push_memory(
const std::vector<uint64_t>& features, std::vector<float*>* push_g,
int dim) {
if (features.size() > push_g->size()) {
push_g->reserve(features.size() + 1);
auto size = features.size() - push_g->size() + 1;
for (auto i = 0u; i < size; ++i) {
float* ptr = new float[dim];
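      // NOTE: these raw buffers are never freed in this function; their
      // lifetime is presumably managed by the owner of push_g.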
push_g->push_back(ptr);
}
}
}
#endif
}  // end namespace framework
} // end namespace paddle
......@@ -25,16 +25,119 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_PSLIB
#include <pslib.h>
#endif
namespace paddle {
namespace framework {
void CreateTensor(Variable* var, proto::VarType::Type var_type);
#ifdef PADDLE_WITH_PSLIB
static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
struct AsyncWorkerParamConfig {
int slot_dim;
int fea_dim;
int32_t tmp_push_dense_wait_times;
int32_t tmp_push_sparse_wait_times;
std::vector<std::string> skip_op;
std::map<uint64_t, std::vector<std::string>> dense_variable_name;
std::map<uint64_t, std::vector<std::string>> dense_gradient_variable_name;
std::vector<int> dense_table_id;
// fea_dim for each dense table
std::vector<uint32_t> dense_table_size;
std::vector<int> sparse_table_id;
std::map<uint64_t, std::vector<std::string>> slot_input_vec;
std::map<uint64_t, std::vector<std::string>> gradient_var;
std::map<std::string, uint64_t> slot_alias_to_table;
};
struct DensePullThreadParam {
std::shared_ptr<paddle::ps::PSClient> ps_client;
int threshold;
int training_thread_num;
Scope* root_scope;
std::map<uint64_t, std::vector<std::string>>* dense_params;
int sleep_time_ms = 2;
};
class DensePullThread {
public:
explicit DensePullThread(const DensePullThreadParam& param)
: _running(false) {
_ps_client = param.ps_client;
_threshold = param.threshold;
_thread_num = param.training_thread_num;
_root_scope = param.root_scope;
_sleep_time_ms = param.sleep_time_ms;
for (auto& t : *param.dense_params) {
_dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(),
t.second.begin(), t.second.end());
_training_versions[t.first].resize(_thread_num, 0);
_last_versions[t.first] = 0;
_current_version[t.first] = 0;
}
}
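  // Version bookkeeping (inferred from the fields below): _current_version
  // advances as pulls complete, _training_versions records per-thread
  // progress, and check_update_param presumably compares the two against
  // _threshold to decide when another pull is due.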
int start();
void stop() {
if (_running) {
_running = false;
_t.join();
}
}
void increase_thread_version(int thread_id, uint64_t table_id);
void reset_thread_version(uint64_t table_id);
std::future<int32_t> pull_dense(uint64_t table_id);
void pull_dense2(uint64_t table_id);
void wait_all();
private:
void run();
bool check_update_param(uint64_t table_id);
private:
std::shared_ptr<paddle::ps::PSClient> _ps_client;
int _thread_num;
int _threshold;
int _sleep_time_ms;
Scope* _root_scope;
bool _running;
std::map<uint64_t, uint64_t> _last_versions;
std::map<uint64_t, uint64_t> _current_version;
std::mutex _mutex_for_version;
std::map<uint64_t, std::vector<uint64_t>> _training_versions;
std::map<uint64_t, std::vector<std::string>> _dense_variable_name;
std::thread _t;
std::vector<::std::future<int32_t>> _pull_dense_status;
std::map<uint64_t, std::vector<paddle::ps::Region>> _regions;
uint32_t _pull_dense_fail_times = 0;
std::vector<float> _base_norm_param;
std::vector<float> _mean;
std::vector<float> _scale;
float _squared_sum_epsilon = 1e-4;
std::mutex _mutex_for_mean_scale;
float _total_batch_num = 0;
};
#endif
class ExecutorThreadWorker {
public:
ExecutorThreadWorker()
: thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
~ExecutorThreadWorker() {}
virtual ~ExecutorThreadWorker() {}
void CreateThreadResource(const framework::ProgramDesc& program,
const paddle::platform::Place& place);
......@@ -51,9 +154,15 @@ class ExecutorThreadWorker {
// set data feed declared in executor
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
// A multi-thread training function
void TrainFiles();
virtual void TrainFiles();
// set fetch variable names from python interface assigned by users
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
#ifdef PADDLE_WITH_PSLIB
virtual void SetPSlibPtr(
std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {}
virtual void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) {}
virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}
#endif
private:
void CreateThreadScope(const framework::ProgramDesc& program);
......@@ -77,12 +186,58 @@ class ExecutorThreadWorker {
Scope* root_scope_;
// a thread scope, father scope is global score which is shared
Scope* thread_scope_;
private:
std::vector<std::string> fetch_var_names_;
std::vector<std::vector<float>> fetch_values_;
bool debug_;
};
#ifdef PADDLE_WITH_PSLIB
class AsyncExecutorThreadWorker : public ExecutorThreadWorker {
public:
AsyncExecutorThreadWorker() {}
virtual ~AsyncExecutorThreadWorker() {}
void SetPSlibPtr(std::shared_ptr<paddle::distributed::PSlib> pslib_ptr);
void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt);
void SetParamConfig(AsyncWorkerParamConfig* param_config);
void TrainFiles();
void TrainOneNetwork();
void PrepareParams();
void UpdateParams();
void PullSparse(int table_id);
void FillSparse(int table_id);
void PushSparse(int table_id);
void PushDense(int table_id);
void check_pull_push_memory(const std::vector<uint64_t>& features,
std::vector<float*>* push_g, int dim);
void check_pull_push_memory(const std::vector<uint64_t>& features,
std::vector<std::vector<float>>* push_g, int dim);
void collect_feasign_info(int table_id);
private:
struct FeasignInfo {
uint32_t slot;
uint32_t ins;
int64_t label;
};
std::map<uint64_t, std::vector<uint64_t>> _features;
std::map<uint64_t, std::vector<FeasignInfo>> _fea_info;
std::map<uint64_t, std::vector<std::vector<float>>> _feature_value;
std::map<uint64_t, std::vector<std::vector<float>>> _feature_push_value;
std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
std::shared_ptr<DensePullThread> _pull_dense_thread;
std::vector<::std::future<int32_t>> _pull_sparse_status;
std::vector<::std::future<int32_t>> _pull_dense_status;
std::vector<::std::future<int32_t>> _push_sparse_status;
std::vector<::std::future<int32_t>> _push_dense_status;
AsyncWorkerParamConfig* _param_config;
};
#endif
} // namespace framework
} // namespace paddle
......@@ -44,6 +44,7 @@ pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(conv_op); \
GET_IR_NODE(conv_out); \
GET_IR_NODE(conv_filter); \
GET_IR_NODE(elementwise_add_op); \
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out);
std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name = "conv_elementwise_add_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("conv2d", "Input")
->AsInput();
patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name);
pattern(x);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_NODES;
auto base_op_desc = *conv_op->Op()->Proto();
std::string bias_name = elementwise_add_in_y->Name();
std::string output_name = elementwise_add_out->Name();
std::string act_type = "identity";
framework::OpDesc new_op_desc(base_op_desc, nullptr);
new_op_desc.SetType("conv2d_fusion");
new_op_desc.SetInput("Bias", {bias_name});
new_op_desc.SetInput("ResidualData", {});
new_op_desc.SetAttr("activation", act_type);
new_op_desc.SetOutput("Output", {output_name});
new_op_desc.SetAttr("is_test", true);
new_op_desc.SetAttr("use_cudnn", false);
new_op_desc.Flush();
// Create a new node for the fused op.
auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
// Link inputs and outputs.
PADDLE_ENFORCE(subgraph.count(x));
auto* conv_in_node = subgraph.at(x);
IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input
IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter
IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias
IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op});
};
gpd(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_elementwise_add_fuse_pass,
paddle::framework::ir::ConvElementwiseAddFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class ConvElementwiseAddFusePass : public FusePassBase {
public:
virtual ~ConvElementwiseAddFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -162,7 +162,10 @@ void Graph::ResolveHazard(
(*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
const auto &read_ops = (*it_old)->outputs;
PADDLE_ENFORCE(write_op, "The write_op should not be empty.");
PADDLE_ENFORCE(
write_op,
string::Sprintf("The write_op of var %s should not be empty.",
(*it_new)->Name()));
// Add write after write dependence
ir::Node *upstream_op =
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <unordered_map>
#include <unordered_set>
DEFINE_string(print_sub_graph_dir, "",
......@@ -121,7 +122,7 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
}
size_t GraphNum(const Graph &graph) {
std::unordered_set<ir::Node *> nodes = graph.Nodes();
std::unordered_set<ir::Node *> nodes(graph.Nodes());
std::unordered_set<ir::Node *> visited_nodes;
visited_nodes.reserve(nodes.size());
std::deque<ir::Node *> q_nodes;
......
......@@ -24,6 +24,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace ir {
// Test if the graph contains circle.
bool HasCircle(const Graph &graph);
......
......@@ -17,7 +17,6 @@
#include <string>
#include <vector>
#include "graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
......@@ -1210,6 +1209,33 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
return act_out;
}
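// Subgraph matched by ConvElementwiseadd (sketched from the links below):
//
//   conv_in -----\
//                 conv2d --> conv_out --> elementwise_add --> elementwise_add_out
//   conv_filter -/                            ^
//                       elementwise_add_in_y -/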
PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
conv_in->AsInput();
auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
auto conv_out = pattern->NewNode(conv_out_repr())
->assert_is_op_output("conv2d")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto conv_filter = pattern->NewNode(conv_filter_repr())
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
->assert_is_op("elementwise_add");
auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
->assert_is_op_output("elementwise_add")
->AsOutput();
conv_op->LinksFrom({conv_in, conv_filter});
conv_out->LinksFrom({conv_op});
elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
.LinksTo({elementwise_add_out});
return elementwise_add_out;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -716,6 +716,24 @@ struct ConvElementwiseadd2Act : public PatternBase {
PATTERN_DECL_NODE(act_out);
};
// Conv + ElementwiseAdd
// This pattern should be used after the ConvElementwiseaddAct and
// ConvElementwiseadd2Act passes
struct ConvElementwiseadd : public PatternBase {
ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_elementwiseadd") {}
PDNode* operator()(PDNode* conv_in);
PATTERN_DECL_NODE(conv_op);
PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(conv_filter);
PATTERN_DECL_NODE(elementwise_add_op);
PATTERN_DECL_NODE(elementwise_add_in_y);
PATTERN_DECL_NODE(elementwise_add_out);
};
} // namespace patterns
// Link two ir::Nodes from each other.
......
......@@ -30,6 +30,14 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
return std::unique_ptr<Node>(new Node(name, type));
}
std::unique_ptr<Node> CreateNodeForTest(VarDesc *var_desc) {
return std::unique_ptr<Node>(new Node(var_desc));
}
std::unique_ptr<Node> CreateNodeForTest(OpDesc *op_desc) {
return std::unique_ptr<Node>(new Node(op_desc));
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -18,7 +18,6 @@ limitations under the License. */
#include <typeindex>
#include <typeinfo>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/macros.h"
......@@ -125,6 +124,8 @@ class Node {
friend class Graph;
friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
Node::Type type);
friend std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
friend std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
explicit Node(const std::string& name, Type type)
: name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
......@@ -152,7 +153,9 @@ class Node {
std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
Node::Type type);
std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm>
#include <string>
#include <tuple>
#include <vector>
......@@ -93,6 +94,7 @@ class ParallelExecutorPrivate {
}
}
BuildStrategy build_strategy_;
std::vector<platform::Place> places_;
std::vector<Scope *> local_scopes_;
Scope *global_scope_; // not owned
......@@ -169,6 +171,14 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
graph = eager_deletion_pass->Apply(std::move(graph));
VLOG(10) << "EagerDeletionPass Applied";
  if (build_strategy_.memory_early_delete_) {
    auto early_delete_pass =
        ir::PassRegistry::Instance().Get("memory_early_delete_pass");
    early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
    graph = early_delete_pass->Apply(std::move(graph));
    VLOG(10) << "MemoryEarlyDeletePass Applied.";
  }
}
return graph;
......@@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor(
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;
member_->build_strategy_ = build_strategy;
member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
......@@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor(
build_strategy.Apply(main_program, member_->places_, loss_var_name,
params, member_->local_scopes_, member_->use_cuda_);
#endif
auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) {
graph = member_->PrepareGCAndRefCnts(std::move(graph),
......@@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor(
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, std::move(graph)));
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph)));
} else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, std::move(graph)));
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph)));
}
member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
......@@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework
} // namespace paddle
USE_PASS(memory_early_delete_pass);
USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass);
......@@ -74,6 +74,22 @@ TEST(Tensor, MutableData) {
p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::CPUPlace());
EXPECT_EQ(p1, p2);
float* p3 = nullptr;
float* p4 = nullptr;
  // Set src_tensor to a different type with a smaller total size;
  // the memory block is expected to stay unchanged.
auto* tmp = src_tensor.mutable_data<uint8_t>(framework::make_ddim({2, 2}),
platform::CPUPlace());
p3 = reinterpret_cast<float*>(tmp);
EXPECT_EQ(p1, p3);
  // Set src_tensor to a different type with a bigger total size;
  // the memory block is expected to be reallocated.
auto* tmp2 = src_tensor.mutable_data<double>(
framework::make_ddim({2, 2, 3}), platform::CPUPlace());
p4 = reinterpret_cast<float*>(tmp2);
EXPECT_NE(p1, p4);
}
// Not sure if it's desired, but currently, Tensor type can be changed.
{
......
......@@ -26,9 +26,6 @@ endif(WIN32)
# paddle_fluid_origin exclude inference api interface
if(WIN32)
sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
endif(WIN32)
......@@ -44,9 +41,6 @@ set(SHARED_INFERENCE_SRCS
if(WIN32)
sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder)
......@@ -63,9 +57,6 @@ if(WIN32)
sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
target_link_libraries(paddle_fluid_shared shlwapi)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
......
......@@ -63,7 +63,6 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
Graph *graph) const {
auto *op_desc = node->Op();
static int counter{0};
auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
......@@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
block_desc.Proto()->SerializeAsString());
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "engine_uniq_key",
"trt-" + std::to_string(counter++));
SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
}
......
......@@ -15,12 +15,43 @@ macro(safe_set_static_flag)
endforeach(flag_var)
endmacro()
if(NOT DEFINED PADDLE_LIB)
message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
endif()
if(NOT DEFINED DEMO_NAME)
message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
endif()
include_directories("${PADDLE_LIB}/")
include_directories("${PADDLE_LIB}/fluid_inference_install_dir/")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")
link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
link_directories("${PADDLE_LIB}/paddle/lib")
if (WIN32)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
if (WITH_STATIC_LIB)
safe_set_static_flag()
add_definitions(-DSTATIC_LIB)
set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w")
set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w")
endif()
set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
else()
......@@ -29,36 +60,15 @@ else()
endif()
message("flags" ${CMAKE_CXX_FLAGS})
if(NOT DEFINED PADDLE_LIB)
message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
endif()
if(NOT DEFINED DEMO_NAME)
message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
endif()
if(WITH_GPU)
if(NOT WIN32)
set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
else()
if(CUDA_LIB STREQUAL "")
set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
endif()
endif(NOT WIN32)
endif()
include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
if (NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
endif(NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")
if (NOT WIN32)
if (USE_TENSORRT AND WITH_GPU)
......@@ -67,18 +77,6 @@ if (NOT WIN32)
endif()
endif(NOT WIN32)
if (NOT WIN32)
link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
endif(NOT WIN32)
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
link_directories("${PADDLE_LIB}/paddle/lib")
if (NOT WIN32)
set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
if(EXISTS ${NGRAPH_PATH})
......@@ -89,8 +87,6 @@ if (NOT WIN32)
endif()
endif()
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL)
include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
......@@ -106,26 +102,25 @@ endif()
# Note: libpaddle_inference_api.so/a must be put before libpaddle_fluid.so/a
if(WITH_STATIC_LIB)
set(DEPS
${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
else()
set(DEPS
${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
glog gflags protobuf snappystream snappy z xxhash
${EXTERNAL_LIB})
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
glog gflags protobuf snappystream snappy z xxhash
${EXTERNAL_LIB})
else()
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
${EXTERNAL_LIB})
# NOTE(dzhwinter) shlwapi is deprecated.
set(DEPS ${DEPS} libcmt shlwapi)
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
snappystream ${EXTERNAL_LIB})
# NOTE(dzhwinter) shlwapi is deprecated.
set(DEPS ${DEPS} libcmt shlwapi)
endif(NOT WIN32)
if(WITH_GPU)
......@@ -137,9 +132,10 @@ if(WITH_GPU)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
else()
set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
endif()
endif()
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
target_link_libraries(${DEMO_NAME} ${DEPS})
......@@ -122,6 +122,7 @@ class GpuPassStrategy : public PassStrategy {
"conv_bn_fuse_pass", //
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
});
}
......
......@@ -103,6 +103,7 @@ class OpConverter {
void ConvertBlock(const framework::proto::BlockDesc& block,
const std::unordered_set<std::string>& parameters,
const framework::Scope& scope, TensorRTEngine* engine) {
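    // The converter registry and engine are shared state, so the lock below
    // presumably exists to serialize concurrent ConvertBlock calls.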
std::unique_lock<std::mutex> lk(mut_);
for (int i = 0; i < block.ops_size(); i++) {
const auto& op = block.ops(i);
ConvertOp(op, parameters, scope, engine);
......@@ -125,6 +126,7 @@ class OpConverter {
std::unordered_map<std::string, OpConverter*> converters_;
// fluid inference scope
framework::Scope* scope_{nullptr};
std::mutex mut_;
};
} // namespace tensorrt
......
......@@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename
ARGS --infer_model=${install_dir}/model)
endfunction()
function(inference_analysis_api_test_with_refer_result target install_dir filename)
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt
--refer_result=${install_dir}/result.txt)
endfunction()
# RNN1
if(NOT APPLE AND WITH_MKLML)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
......@@ -83,14 +90,21 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif()
inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
# mobilenet with transpose op
set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
endif()
inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
# resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
# mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
# anakin
......
......@@ -93,18 +93,20 @@ void profile(bool use_mkldnn = false) {
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
const float ocr_result_data[] = {
5.273636460856323538e-08, 3.296741795111302054e-07,
1.873261190610264748e-08, 3.403730275408634043e-08,
3.383312474625199684e-08};
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
size_t size = GetSize(outputs[0]);
PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(outputs[0].data.data());
for (size_t i = 0; i < std::min(5UL, size); i++) {
EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
std::string line;
std::ifstream file(FLAGS_refer_result);
std::getline(file, line);
auto refer = ProcessALine(line);
file.close();
auto &output = outputs.front();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
CHECK_EQ(numel, refer.data.size());
for (size_t i = 0; i < numel; ++i) {
CHECK_LT(
fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
1e-5);
}
}
}
......
......@@ -36,6 +36,7 @@
DEFINE_string(model_name, "", "model name");
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data file");
DEFINE_string(refer_result, "", "reference result for comparison");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
static constexpr char kInputs[] = "inputs";
static constexpr char kParameters[] = "parameters";
static constexpr char kPlaces[] = "places";
static constexpr char kOutputs[] = "outputs";
static constexpr char kParallelScopes[] = "parallel_scopes";
static constexpr char kParallelBlock[] = "sub_block";
static constexpr char kUseNCCL[] = "use_nccl";
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
static void SplitTensorAndMoveTensorToScopes(
const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
const std::vector<platform::Place> &places,
const std::vector<std::string> &names) {
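  // Split each named LoDTensor across the given places and move slice i
  // into sub-scope i, creating the sub-scopes lazily on the first variable.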
size_t num_sub_scopes = 0;
for (auto &argu : names) {
const auto &tensor =
detail::Ref(scope.FindVar(argu),
"Cannot find variable %s in the parent scope", argu)
.Get<LoDTensor>();
auto lod_tensors = tensor.SplitLoDTensor(places);
for (auto &lod : lod_tensors) {
VLOG(3) << lod.dims();
}
if (num_sub_scopes == 0) {
num_sub_scopes = lod_tensors.size();
} else {
PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size());
}
PADDLE_ENFORCE_NE(num_sub_scopes, 0);
if (sub_scopes->size() == 0) {
sub_scopes->reserve(num_sub_scopes);
for (size_t i = 0; i < num_sub_scopes; ++i) {
sub_scopes->emplace_back(&scope.NewScope());
}
}
for (size_t i = 0; i < lod_tensors.size(); ++i) {
*detail::Ref(sub_scopes->at(i)->Var(argu),
"Cannot find variable in the sub-scope", argu)
.GetMutable<LoDTensor>() = lod_tensors[i];
}
}
}
inline void CopyOrShare(const framework::Variable &src,
const platform::Place &dst_place,
framework::Variable *dst) {
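  // Share the underlying buffer when src already lives on dst_place;
  // otherwise fall back to a cross-device TensorCopy.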
if (src.IsType<LoDTensor>()) {
if (src.Get<LoDTensor>().place() == dst_place) {
dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
} else {
TensorCopy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
}
} else if (src.IsType<SelectedRows>()) {
auto &src_sr = src.Get<SelectedRows>();
auto *dst_sr = dst->GetMutable<SelectedRows>();
dst_sr->set_height(src_sr.height());
if (src_sr.value().place() == dst_place) {
dst_sr->mutable_value()->ShareDataWith(src_sr.value());
dst_sr->set_rows(src_sr.rows());
} else {
TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value());
}
} else {
PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
}
}
void WaitOnPlace(const platform::Place place) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
dev_ctx.Wait();
}
void WaitOnPlaces(const std::vector<platform::Place> places) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
for (auto &place : places) {
auto &dev_ctx = *pool.Get(place);
dev_ctx.Wait();
}
}
class ParallelDoOp : public framework::OperatorBase {
public:
ParallelDoOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
auto *program = block->Program();
auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
auto &sub_scopes = *scope.FindVar(Output(kParallelScopes))
->GetMutable<std::vector<framework::Scope *>>();
// split input
SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places,
Inputs(kInputs));
// copy parameter
for (auto &param : Inputs(kParameters)) {
PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
"Only support parameter type as LoDTensor");
auto &src = scope.FindVar(param)->Get<LoDTensor>();
auto *sub_scope0 = sub_scopes[0];
auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
dst0->ShareDataWith(src);
for (size_t i = 1; i < sub_scopes.size(); ++i) {
auto &place = places[i];
auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
framework::TensorCopy(src, place, dst);
}
}
WaitOnPlaces(places);
std::vector<std::future<void>> workers;
workers.reserve(places.size());
for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) {
auto &place = places[place_idx];
auto *cur_scope = sub_scopes[place_idx];
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
}
for (auto &worker : workers) {
worker.wait();
}
WaitOnPlaces(places);
// merge output
for (auto &o_name : Outputs(kOutputs)) {
std::vector<const framework::LoDTensor *> lod_tensors;
lod_tensors.reserve(sub_scopes.size());
for (auto *sub_scope : sub_scopes) {
lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get<LoDTensor>());
}
auto *lod_tensor_to_be_merged =
scope.FindVar(o_name)->GetMutable<LoDTensor>();
lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace());
}
WaitOnPlaces(places);
}
};
class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(kInputs, "").AsDuplicable();
AddInput(kParameters, "").AsDuplicable();
AddInput(kPlaces, "");
AddOutput(kOutputs, "").AsDuplicable();
AddOutput(kParallelScopes, "");
AddAttr<framework::BlockDesc *>(kParallelBlock, "");
AddAttr<bool>(kUseNCCL, "true if we use nccl on backward")
.SetDefault(false);
AddComment(R"DOC(
ParallelDo Operator.
)DOC");
}
};
class ParallelDoGradOp : public framework::OperatorBase {
public:
ParallelDoGradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
auto *program = block->Program();
auto &sub_scopes = scope.FindVar(Input(kParallelScopes))
->Get<std::vector<framework::Scope *>>();
auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
// feed output@grad
SplitTensorAndMoveTensorToScopes(
scope, const_cast<std::vector<framework::Scope *> *>(&sub_scopes),
places, Inputs(framework::GradVarName(kOutputs)));
WaitOnPlaces(places);
// exe run
std::vector<std::future<void>> workers;
for (size_t i = 0; i < sub_scopes.size(); ++i) {
auto &place = places[i];
auto *cur_scope = sub_scopes[i];
// execute
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
}
for (auto &worker : workers) {
worker.wait();
}
WaitOnPlaces(places);
// NCCL allreduce op will be added by backward,
// so no need to explicitly accumulate grad
if (!(Attr<bool>(kUseNCCL))) {
AccumulateGrad(scope, place, sub_scopes, places);
} else {
for (auto &place : places) {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"NCCL only supports cuda place");
}
}
for (auto &s : Outputs(framework::GradVarName(kParameters))) {
if (s == framework::kEmptyVarName) {
continue;
}
VLOG(3) << "Moving " << s;
CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
}
WaitOnPlaces(places);
}
void AccumulateGrad(const framework::Scope &scope,
const platform::Place &place,
const std::vector<framework::Scope *> &sub_scopes,
const platform::PlaceList &places) const {
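    // Scheme: for each parameter gradient s, copy the replica held by
    // sub-scope i (i >= 1) onto places[0] as a temporary, then run a "sum"
    // op so sub_scopes[0] accumulates all replicas before the final
    // CopyOrShare back into the parent scope.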
for (auto &s : Outputs(framework::GradVarName(kParameters))) {
if (s == framework::kEmptyVarName) {
continue;
}
VLOG(3) << "Accumulating " << s;
std::string tmp_name;
auto *tmp = sub_scopes[0]->Var(&tmp_name);
for (size_t i = 1; i < sub_scopes.size(); ++i) {
CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
WaitOnPlaces(places);
auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
framework::AttributeMap{{"use_mkldnn", {false}}});
VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
sum_op->Run(*sub_scopes[0], places[0]);
WaitOnPlace(places[0]);
}
CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
}
WaitOnPlaces(places);
}
};
std::ostream &operator<<(std::ostream &sout,
const std::vector<std::string> &strs) {
std::copy(strs.begin(), strs.end(),
std::ostream_iterator<std::string>(sout, ","));
return sout;
}
class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
virtual std::unique_ptr<framework::OpDesc> Apply() const {
auto *grad = new framework::OpDesc();
grad->SetType("parallel_do_grad");
for (auto &input_param : this->InputNames()) {
VLOG(3) << input_param;
grad->SetInput(input_param, this->Input(input_param));
if (input_param != kPlaces) {
grad->SetOutput(framework::GradVarName(input_param),
this->InputGrad(input_param, false));
}
}
auto *g_block = this->grad_block_[0];
    // All variable names needed by the gradient operators
std::unordered_set<std::string> all_inputs_in_grad_blocks;
for (size_t i = 0; i < g_block->OpSize(); ++i) {
auto *op = g_block->Op(i);
for (auto &var_name : op->InputArgumentNames()) {
all_inputs_in_grad_blocks.insert(var_name);
}
}
for (auto &output_param : this->OutputNames()) {
if (output_param == kParallelScopes) {
grad->SetInput(output_param, this->Output(output_param));
grad->SetInput(framework::GradVarName(output_param),
this->Output(output_param));
} else {
grad->SetInput(output_param, this->Output(output_param));
std::vector<std::string> og_names;
for (auto &og_name : this->OutputGrad(output_param)) {
if (all_inputs_in_grad_blocks.count(og_name) != 0) {
          // some gradient operators need this OG, so make it an input of
          // parallel_do
og_names.push_back(og_name);
}
        // otherwise no operator needs this OG; do not use it as an input
}
grad->SetInput(framework::GradVarName(output_param), og_names);
}
}
grad->SetInput("Communicator", {"nccl_com__do_not_change_"});
grad->SetAttrMap(this->Attrs());
grad->SetBlockAttr(kParallelBlock, grad_block_[0]);
return std::unique_ptr<framework::OpDesc>(grad);
}
};
class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInputs(kParameters));
PADDLE_ENFORCE(ctx->HasInputs(kInputs));
PADDLE_ENFORCE(ctx->HasInputs(kOutputs));
ctx->SetOutputsDim(framework::GradVarName(kParameters),
ctx->GetInputsDim(kParameters));
auto i_dims = ctx->GetInputsDim(kInputs);
auto ig_names = ctx->Outputs(framework::GradVarName(kInputs));
for (size_t i = 0; i < ig_names.size(); ++i) {
auto &ig_name = ig_names[i];
if (ig_name == framework::kEmptyVarName) {
continue;
}
ctx->SetDims({ig_name}, {i_dims[i]});
}
auto p_dims = ctx->GetInputsDim(kParameters);
auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
for (size_t i = 0; i < pg_names.size(); ++i) {
auto &pg_name = pg_names[i];
if (pg_name == framework::kEmptyVarName) {
continue;
}
ctx->SetDims({pg_name}, {p_dims[i]});
}
}
};
class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
framework::BlockDesc *sub_block =
boost::get<framework::BlockDesc *>(op_desc.GetAttr(kParallelBlock));
for (auto &out_vars : op_desc.Outputs()) {
for (auto &out_var : out_vars.second) {
auto &var = block->FindRecursiveOrCreateVar(out_var);
auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var);
if (sub_var.GetType() != var.GetType()) {
var.SetType(sub_var.GetType());
}
}
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
paddle::operators::ParallelDoOpProtoMaker,
paddle::operators::ParallelDoGradOpDescMaker);
REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp,
paddle::operators::ParallelDoGradOpShapeInference,
paddle::operators::ParallelDoGradOpVarTypeInference);
......@@ -146,7 +146,8 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
// At least use 32 threads, at most 512 threads.
// blockx is multiple of 32.
int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
int blockx = std::min(
static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L);
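  // e.g. feature_width * num_priors == 70: ((70 + 31) >> 5) << 5 == 96
  // threads, which std::min then caps at 512.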
int gridx = (feature_width * num_priors + blockx - 1) / blockx;
dim3 threads(blockx, 1);
dim3 grids(gridx, feature_height);
......
......@@ -35,12 +35,12 @@ namespace operators {
template <typename T>
__device__ bool GT_E(T a, T b) {
return (a > b) || fabs(a - b) < 1e-4;
return (a > b) || Eigen::numext::abs(a - b) < 1e-4;
}
template <typename T>
__device__ bool LT_E(T a, T b) {
return (a < b) || fabs(a - b) < 1e-4;
return (a < b) || Eigen::numext::abs(a - b) < 1e-4;
}
template <typename T>
......
......@@ -488,7 +488,7 @@ void AsyncGRPCServer::HandleRequest(
while (true) {
VLOG(4) << "HandleRequest " << rpc_name << " wait next";
if (!cq->Next(&tag, &ok)) {
VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
LOG(WARNING) << "CompletionQueue " << rpc_name << " shutdown!";
break;
}
......@@ -511,9 +511,8 @@ void AsyncGRPCServer::HandleRequest(
// https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
// https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
if (!ok) {
LOG(WARNING) << "completion queue:" << rpc_name
<< " recv no regular event"
<< " context:" << base->Status2String(rpc_name);
VLOG(4) << "completion queue:" << rpc_name << " recv no regular event"
<< " context:" << base->Status2String(rpc_name);
TryToRegisterNewOne(rpc_name, req_id);
delete base;
continue;
......
......@@ -124,8 +124,9 @@ REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
REGISTER_OP_CPU_KERNEL(
huber_loss,
ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
huber_loss, ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>,
ops::HuberLossKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
huber_loss_grad,
ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>);
ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -18,9 +18,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
......
......@@ -15,14 +15,10 @@ limitations under the License. */
#pragma once
#include <math.h>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
......
......@@ -14,10 +14,8 @@ limitations under the License. */
#ifdef __AVX__
#include <immintrin.h>
#include "paddle/fluid/operators/math/detail/activation_functions.h"
// TODO(qingqing) refine this dependence
#include "paddle/legacy/cuda/src/avx_mathfun.h"
#include "paddle/fluid/operators/math/detail/avx_mathfun.h"
namespace paddle {
namespace operators {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
AVX implementation of sin, cos, sincos, exp and log
Based on "sse_mathfun.h", by Julien Pommier
http://gruntthepeon.free.fr/ssemath/
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#include "paddle/fluid/platform/cpu_info.h"
/* __m128 is ugly to write */
typedef __m256 v8sf; // vector of 8 float (avx)
typedef __m256i v8si; // vector of 8 int (avx)
typedef __m128i v4si; // vector of 4 int (sse2)
#define _PI32AVX_CONST(Name, Val) \
static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \
Val, Val}
_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);
/* declare some AVX constants -- why can't I figure a better way to do that? */
#define _PS256_CONST(Name, Val) \
static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
Val, Val, Val, Val, Val, Val, Val, Val}
#define _PI32_CONST256(Name, Val) \
static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
Val, Val, Val, Val, Val, Val, Val, Val}
#define _PS256_CONST_TYPE(Name, Type, Val) \
static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
Val, Val, Val, Val, Val, Val, Val, Val}
_PS256_CONST(1, 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);
#ifndef __AVX2__
typedef union imm_xmm_union {
v8si imm;
v4si xmm[2];
} imm_xmm_union;
#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
{ \
imm_xmm_union ALIGN32_BEG u ALIGN32_END; \
u.imm = imm_; \
xmm0_ = u.xmm[0]; \
xmm1_ = u.xmm[1]; \
}
#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
{ \
imm_xmm_union ALIGN32_BEG u ALIGN32_END; \
u.xmm[0] = xmm0_; \
u.xmm[1] = xmm1_; \
imm_ = u.imm; \
}
#define AVX2_BITOP_USING_SSE2(fn) \
static inline v8si avx2_mm256_##fn(v8si x, int a) { \
/* use SSE2 instruction to perform the bitop AVX2 */ \
v4si x1, x2; \
v8si ret; \
COPY_IMM_TO_XMM(x, x1, x2); \
x1 = _mm_##fn(x1, a); \
x2 = _mm_##fn(x2, a); \
COPY_XMM_TO_IMM(x1, x2, ret); \
return (ret); \
}
//#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
#define AVX2_INTOP_USING_SSE2(fn) \
static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \
/* use SSE2 instructions to perform the AVX2 integer operation */ \
v4si x1, x2; \
v4si y1, y2; \
v8si ret; \
COPY_IMM_TO_XMM(x, x1, x2); \
COPY_IMM_TO_XMM(y, y1, y2); \
x1 = _mm_##fn(x1, y1); \
x2 = _mm_##fn(x2, y2); \
COPY_XMM_TO_IMM(x1, x2, ret); \
return (ret); \
}
//#warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
AVX2_INTOP_USING_SSE2(andnot_si128)
AVX2_INTOP_USING_SSE2(cmpeq_epi32)
AVX2_INTOP_USING_SSE2(sub_epi32)
AVX2_INTOP_USING_SSE2(add_epi32)
#define avx2_mm256_and_si256 avx2_mm256_and_si128
#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128
#else
#define avx2_mm256_slli_epi32 _mm256_slli_epi32
#define avx2_mm256_srli_epi32 _mm256_srli_epi32
#define avx2_mm256_and_si256 _mm256_and_si256
#define avx2_mm256_andnot_si256 _mm256_andnot_si256
#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32
#define avx2_mm256_sub_epi32 _mm256_sub_epi32
#define avx2_mm256_add_epi32 _mm256_add_epi32
#endif /* __AVX2__ */
/* natural logarithm computed for 8 simultaneous floats;
   returns NaN for x <= 0
*/
v8sf log256_ps(v8sf x) {
v8si imm0;
v8sf one = *(v8sf *)_ps256_1;
// v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
x = _mm256_max_ps(
x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
// can be done with AVX2
imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
/* keep only the fractional part */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
// this is again another AVX2 instruction
imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
v8sf e = _mm256_cvtepi32_ps(imm0);
e = _mm256_add_ps(e, one);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
// v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
v8sf tmp = _mm256_and_ps(x, mask);
x = _mm256_sub_ps(x, one);
e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
x = _mm256_add_ps(x, tmp);
v8sf z = _mm256_mul_ps(x, x);
v8sf y = *(v8sf *)_ps256_cephes_log_p0;
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
y = _mm256_mul_ps(y, x);
y = _mm256_mul_ps(y, z);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
y = _mm256_add_ps(y, tmp);
tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
y = _mm256_sub_ps(y, tmp);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
x = _mm256_add_ps(x, y);
x = _mm256_add_ps(x, tmp);
x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
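/* A minimal usage sketch (illustrative only, not part of the original file;
   assumes this translation unit is compiled with AVX enabled):

     ALIGN32_BEG float in[8] ALIGN32_END = {0.5f, 1.f, 2.f, 4.f,
                                            8.f, 16.f, 32.f, 64.f};
     v8sf x = _mm256_load_ps(in);
     v8sf y = log256_ps(x);     // eight natural logs in one call
     float out[8];
     _mm256_storeu_ps(out, y);  // out[1] == 0.0f, out[2] ~= 0.6931f
*/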
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf exp256_ps(v8sf x) {
v8sf tmp = _mm256_setzero_ps(), fx;
v8si imm0;
v8sf one = *(v8sf *)_ps256_1;
x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
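  /* range reduction: with n = round(x * log2(e)), exp(x) = 2^n * exp(g),
     where g = x - n*ln(2) (ln(2) split into C1 + C2 for precision) and
     |g| <= 0.5*ln(2); the polynomial below approximates exp(g), and the
     final slli_epi32 builds 2^n directly in the float exponent bits. */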
fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
/* how to perform a floorf with SSE: just below */
// imm0 = _mm256_cvttps_epi32(fx);
// tmp = _mm256_cvtepi32_ps(imm0);
tmp = _mm256_floor_ps(fx);
/* if greater, subtract 1 */
// v8sf mask = _mm256_cmpgt_ps(tmp, fx);
v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x, x);
v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
// another two AVX2 instructions
imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_slli_epi32(imm0, 23);
v8sf pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}
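/* Usage sketch (illustrative, not part of the original kernel): exp256_ps
   and log256_ps round-trip lane-wise within the accuracy of the polynomial
   approximations. */
static inline v8sf exp256_ps_example() {
  v8sf x = _mm256_set1_ps(1.0f);
  return log256_ps(exp256_ps(x)); /* each lane is approximately 1.0f */
}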
_PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
_PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 8 sines at once using AVX intrinsics
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though
the extra precision is missing).
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is a
surprising but correct result.
*/
v8sf sin256_ps(v8sf x) { // any x
v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
v8si imm0, imm2;
#ifndef __AVX2__
v4si imm0_1, imm0_2;
v4si imm2_1, imm2_2;
#endif
sign_bit = x;
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
/*
Here we start a series of integer operations, which are in the
realm of AVX2.
If we don't have AVX2, let's perform them using SSE2 intrinsics
*/
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
// another two AVX2 instructions
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
y = _mm256_cvtepi32_ps(imm2);
/* get the swap sign flag */
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_slli_epi32(imm0, 29);
/* get the polynomial selection mask
there is one polynomial for 0 <= x <= Pi/4
and another one for Pi/4 < x <= Pi/2
Both branches will be computed.
*/
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
v8sf poly_mask = _mm256_castsi256_ps(imm2);
sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
x = _mm256_add_ps(x, xmm1);
x = _mm256_add_ps(x, xmm2);
x = _mm256_add_ps(x, xmm3);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
y = *(v8sf *)_ps256_coscof_p0;
v8sf z = _mm256_mul_ps(x, x);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
/* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
/* select the correct result from the two polynomials */
xmm3 = poly_mask;
y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
y = _mm256_andnot_ps(xmm3, y);
y = _mm256_add_ps(y, y2);
/* update the sign */
y = _mm256_xor_ps(y, sign_bit);
return y;
}
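/* Usage sketch (illustrative, not part of the original kernel):
   demonstrates the precision note above near x = (float)M_PI. */
static inline v8sf sin256_ps_example() {
  v8sf x = _mm256_set1_ps(3.14159265358979f); /* rounds to (float)M_PI */
  return sin256_ps(x); /* lanes have magnitude ~8.74e-8, like sinf((float)M_PI) */
}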
/* almost the same as sin256_ps */
v8sf cos256_ps(v8sf x) { // any x
v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
v8si imm0, imm2;
#ifndef __AVX2__
v4si imm0_1, imm0_2;
v4si imm2_1, imm2_2;
#endif
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
y = _mm256_cvtepi32_ps(imm2);
imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
/* get the swap sign flag */
imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_slli_epi32(imm0, 29);
/* get the polynomial selection mask */
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
v8sf sign_bit = _mm256_castsi256_ps(imm0);
v8sf poly_mask = _mm256_castsi256_ps(imm2);
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
x = _mm256_add_ps(x, xmm1);
x = _mm256_add_ps(x, xmm2);
x = _mm256_add_ps(x, xmm3);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
y = *(v8sf *)_ps256_coscof_p0;
v8sf z = _mm256_mul_ps(x, x);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
/* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
/* select the correct result from the two polynomials */
xmm3 = poly_mask;
y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
y = _mm256_andnot_ps(xmm3, y);
y = _mm256_add_ps(y, y2);
/* update the sign */
y = _mm256_xor_ps(y, sign_bit);
return y;
}
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could
replace both of them.
It is almost as fast, and gives you a free cosine with your sine. */
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
v8si imm0, imm2, imm4;
#ifndef __AVX2__
v4si imm0_1, imm0_2;
v4si imm2_1, imm2_2;
v4si imm4_1, imm4_2;
#endif
sign_bit_sin = x;
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
#ifdef __AVX2__
/* store the integer part of y in imm2 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
y = _mm256_cvtepi32_ps(imm2);
imm4 = imm2;
/* get the swap sign flag for the sine */
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_slli_epi32(imm0, 29);
// v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
/* get the polynomial selection mask for the sine */
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
// v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
imm4_1 = imm2_1;
imm4_2 = imm2_2;
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
v8sf poly_mask = _mm256_castsi256_ps(imm2);
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
x = _mm256_add_ps(x, xmm1);
x = _mm256_add_ps(x, xmm2);
x = _mm256_add_ps(x, xmm3);
#ifdef __AVX2__
imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
imm4 = avx2_mm256_slli_epi32(imm4, 29);
#else
imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
imm4_1 = _mm_slli_epi32(imm4_1, 29);
imm4_2 = _mm_slli_epi32(imm4_2, 29);
COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif
v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
v8sf z = _mm256_mul_ps(x, x);
y = *(v8sf *)_ps256_coscof_p0;
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
/* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
/* select the correct result from the two polynomials */
xmm3 = poly_mask;
v8sf ysin2 = _mm256_and_ps(xmm3, y2);
v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
y2 = _mm256_sub_ps(y2, ysin2);
y = _mm256_sub_ps(y, ysin1);
xmm1 = _mm256_add_ps(ysin1, ysin2);
xmm2 = _mm256_add_ps(y, y2);
/* update the sign */
*s = _mm256_xor_ps(xmm1, sign_bit_sin);
*c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
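/* Usage sketch (illustrative, not part of the original kernel): one call
   yields both results, cheaper than separate sin256_ps and cos256_ps calls. */
static inline void sincos256_ps_example() {
  v8sf s, c;
  sincos256_ps(_mm256_set1_ps(0.5f), &s, &c);
  /* lanes of s ~= sinf(0.5f), lanes of c ~= cosf(0.5f) */
}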
......@@ -113,26 +113,27 @@ void VXXJitCode::generate() {
ret();
}
const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
REPEAT_8TIMES(2.f),
REPEAT_8TIMES(0.5f),
REPEAT_8TIMES(EXP_HIG),
REPEAT_8TIMES(EXP_LOW),
REPEAT_8TIMES(CEPHES_LOG2EF),
REPEAT_8TIMES(CEPHES_EXP_C1),
REPEAT_8TIMES(CEPHES_EXP_C2),
REPEAT_8TIMES(CEPHES_EXP_P0),
REPEAT_8TIMES(CEPHES_EXP_P1),
REPEAT_8TIMES(CEPHES_EXP_P2),
REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
REPEAT_8TIMES(1.f),
REPEAT_8TIMES(2.f),
REPEAT_8TIMES(0.5f),
REPEAT_8TIMES(EXP_HIG),
REPEAT_8TIMES(EXP_LOW),
REPEAT_8TIMES(CEPHES_LOG2EF),
REPEAT_8TIMES(CEPHES_EXP_C1),
REPEAT_8TIMES(CEPHES_EXP_C2),
REPEAT_8TIMES(CEPHES_EXP_P0),
REPEAT_8TIMES(CEPHES_EXP_P1),
REPEAT_8TIMES(CEPHES_EXP_P2),
REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
int g_tmp_mem[16] ALIGN32 = {0};
const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
bool VActJitCode::init(int d, operand_type type) {
// TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
......
......@@ -47,7 +47,6 @@ extern const float exp_float_consts[];
extern const int exp_int_0x7f[];
extern int g_tmp_mem[];
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
......
......@@ -16,9 +16,6 @@ limitations under the License. */
#include <limits>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
......@@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
/* AVX instructions.*/ \
__m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); \
__m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); \
__m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); \
__m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); \
__m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0); \
__m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1); \
lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); \
hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); \
lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); \
......
......@@ -13,9 +13,6 @@ limitations under the License. */
#include <limits>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
......@@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
if (rest_ != 0) { \
j = offset + this->num_ - block; \
tmp = _mm256_loadu_ps((const float*)x + j); \
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
sum = _mm256_add_ps(sum, tmp); \
} \
hi = _mm256_extractf128_ps(sum, 1); \
......@@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
j = offset + this->num_ - block; \
tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
tmp = _mm256_mul_ps(tmp, tmp); \
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
sum = _mm256_add_ps(sum, tmp); \
} \
hi = _mm256_extractf128_ps(sum, 1); \
......
......@@ -109,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddAttr<bool>(
"lazy_mode",
"(bool, default false) "
"only update the parameter that has gradient in sparse update")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
......
......@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
const int64_t* rows_;
int64_t row_numel_;
int64_t row_count_;
bool lazy_mode_;
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad,
const T* param, T* param_out, const int64_t* rows,
int64_t row_numel, int64_t row_count)
int64_t row_numel, int64_t row_count, bool lazy_mode)
: beta1_(beta1),
beta2_(beta2),
epsilon_(epsilon),
......@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
param_out_(param_out),
rows_(rows),
row_numel_(row_numel),
row_count_(row_count) {}
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
row_count_(row_count),
lazy_mode_(lazy_mode) {}
inline HOSTDEVICE void adam_update(size_t i, T g) const {
// The following code is the same as dense
T mom1 = moment1_[i];
T mom2 = moment2_[i];
......@@ -225,6 +223,17 @@ struct SparseAdamFunctor {
moment2_out_[i] = mom2;
param_out_[i] = p;
}
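// With lazy_mode enabled, rows absent from the sparse gradient are
// skipped entirely; otherwise they are updated with g = 0 so that the
// moment estimates still decay, matching the dense Adam behaviour.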
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
if (lazy_mode_ && row_idx < 0) {
return;
} else {
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
adam_update(i, g);
}
}
};
template <typename DeviceContext, typename T>
......@@ -240,6 +249,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
using paddle::framework::LoDTensor;
using paddle::operators::detail::Ref;
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
......@@ -351,11 +361,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size());
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
grad_merge.rows().size(), lazy_mode);
VLOG(3) << "lazy_mode :" << lazy_mode;
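// Lazy path (CPU only): visit only the elements of rows that are
// present in the merged sparse gradient instead of every element of
// the parameter tensor.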
if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) {
size_t row_count = grad_merge.rows().size();
std::vector<int64_t> cpu_rows(grad_merge.rows());
for (size_t row_index = 0; row_index < row_count; ++row_index) {
for (size_t offset = 0; offset < row_numel; ++offset) {
size_t i = cpu_rows[row_index] * row_numel + offset;
functor.adam_update(i, grad_data[row_index * row_numel + offset]);
}
}
} else {
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
}
} else {
PADDLE_THROW("Variable type not supported by adam_op");
}
......
op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op
analysis)
......@@ -21,8 +21,6 @@
namespace paddle {
DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
namespace operators {
class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -31,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Xs", "A list of inputs.").AsDuplicable();
AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
AddAttr<int>("max_batch_size", "the maximum batch size.");
AddAttr<int>("workspace_size", "the workspace size.");
AddComment("TensorRT engine operator.");
......@@ -50,6 +47,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
namespace ops = paddle::operators;
REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
ops::TensorRTEngineOpMaker);
#endif // PADDLE_WITH_CUDA
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
tensorrt_engine,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, float>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, double>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -27,8 +27,6 @@
namespace paddle {
DECLARE_int32(tensorrt_engine_batch_size);
namespace operators {
using FluidDT = framework::proto::VarType_Type;
......@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
return TRT_DT::kINT32;
}
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
PADDLE_ENFORCE_GT(shape.size(), 1UL,
"TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL,
......@@ -63,128 +61,119 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
} // namespace // NOLINT
using inference::Singleton;
using inference::tensorrt::TRT_EngineManager;
using inference::tensorrt::TensorRTEngine;
class TensorRTEngineOp : public framework::OperatorBase {
private:
std::vector<std::string> input_names_;
std::unordered_set<std::string> param_names_;
mutable std::unique_ptr<TensorRTEngine> trt_engine_;
int max_batch_size_;
int workspace_size_;
class TensorRTEngineOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
TensorRTEngineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {
input_names_ = Inputs("Xs");
max_batch_size_ = Attr<int>("max_batch_size");
workspace_size_ = Attr<int>("workspace_size");
auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) {
param_names_.insert(param);
}
}
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input0 = ctx.Inputs("Xs").front();
framework::OpKernelType kt = framework::OpKernelType(
ctx.scope().FindVar(input0)->GetMutable<framework::LoDTensor>()->type(),
ctx.GetPlace());
return kt;
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
RunTrt(scope, dev_place);
}
};
template <typename DeviceContext, typename T>
class TensorRTEngineKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto engine_name = context.Attr<std::string>("engine_uniq_key");
int max_batch_size = context.Attr<int>("max_batch_size");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context);
void RunTrt(const framework::Scope &scope,
const platform::Place &dev_place) const {
int runtime_batch = 1;
if (trt_engine_.get() == nullptr) {
trt_engine_.reset(new TensorRTEngine(
max_batch_size_, workspace_size_, nullptr,
boost::get<platform::CUDAPlace>(dev_place).device));
Prepare(scope, dev_place, trt_engine_.get());
}
auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
auto input_names = context.op().Inputs("Xs");
PADDLE_ENFORCE(!input_names.empty(), "should pass at least one input");
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
auto *engine = trt_engine_.get();
PADDLE_ENFORCE(!input_names_.empty(), "should pass at least one input");
std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping");
Attr<std::vector<std::string>>("output_name_mapping");
auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters;
for (const auto& param : params) {
parameters.insert(param);
}
// Convert input tensor from fluid to engine.
for (const auto& x : context.Inputs("Xs")) {
if (parameters.count(x)) continue;
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
// convert input and copy to TRT engine's buffer
auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
context.scope(), x);
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
auto t_shape = framework::vectorize(t.dims());
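// The runtime batch size is taken from the first dimension of the
// actual input tensor rather than from a global flag.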
runtime_batch = t_shape[0];
if (platform::is_cpu_place(t.place())) {
engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
t.memory_size());
} else {
engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
t.memory_size());
}
}
PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
// Execute the engine.
PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
engine->Execute(FLAGS_tensorrt_engine_batch_size);
engine->Execute(runtime_batch);
// Convert output tensor from engine to fluid
int output_index = 0;
VLOG(4) << "TensorRT Engine Op Outputs:";
for (const auto& y : context.Outputs("Ys")) {
for (const auto &y : Outputs("Ys")) {
VLOG(4) << y;
// convert output and copy to fluid.
nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
auto dims = trt_t->getDimensions();
// Use the output ITensor's dims to reshape the Fluid Tensor.
// The ITensor doesn't contain the batch size dim.
std::vector<int> ddim;
ddim.push_back(FLAGS_tensorrt_engine_batch_size);
ddim.push_back(runtime_batch);
for (int i = 0; i < dims.nbDims; i++) {
ddim.push_back(dims.d[i]);
}
auto* fluid_v = context.scope().FindVar(y);
auto *fluid_v = scope.FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
fluid_t->Resize(framework::make_ddim(ddim));
// TODO(Superjomn) find some way to determine which device to output the
// tensor.
// if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size.
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
FLAGS_tensorrt_engine_batch_size;
auto size =
inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
engine->GetOutputInGPU(
output_maps[output_index],
fluid_t->mutable_data<float>(platform::CUDAPlace(
boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
boost::get<platform::CUDAPlace>(dev_place).device)),
size * sizeof(float));
output_index += 1;
}
cudaStreamSynchronize(*engine->stream());
}
protected:
void Prepare(const framework::ExecutionContext& context) const {
void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
TensorRTEngine *engine) const {
VLOG(4) << "Prepare engine";
// Get the ProgramDesc and pass to convert.
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
int max_batch_size = context.Attr<int>("max_batch_size");
int workspace_size = context.Attr<int>("workspace_size");
auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters;
for (const auto& param : params) {
parameters.insert(param);
}
block_desc.ParseFromString(Attr<std::string>("subgraph"));
std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping");
// TODO(Superjomn) replace this with a different stream
auto* engine = Singleton<TRT_EngineManager>::Global().Create(
max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
context.Attr<std::string>("engine_uniq_key"),
boost::get<platform::CUDAPlace>(context.GetPlace()).device);
Attr<std::vector<std::string>>("output_name_mapping");
engine->InitNetwork();
......@@ -192,39 +181,33 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
VLOG(4) << "parsed var size " << block.AllVars().size();
// Add inputs
VLOG(4) << "declare inputs";
for (auto& input : context.Inputs("Xs")) {
if (parameters.count(input)) continue;
for (auto &input : Inputs("Xs")) {
if (param_names_.count(input)) continue;
VLOG(4) << "declare input " << input;
auto* var = block.FindVar(input);
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
auto t_shape = framework::vectorize(t.dims());
auto *var = block.FindVar(input);
// TensorRT engine need to create parameters. The parameter's description
// should be set in
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
auto shape = var->GetShape();
// For the special batch_size placeholder -1, drop it and pass the real
// shape of data.
// TODO(Superjomn) fix this with batch broadcast, or it can't handle
// variational batch size.
if (shape[0] == -1) {
shape[0] = FLAGS_tensorrt_engine_batch_size;
}
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(shape));
Vec2TRT_Dims(t_shape));
}
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlock(block_desc, parameters, context.scope(), engine);
.ConvertBlock(block_desc, param_names_, scope, engine);
// Add outputs
for (auto& output : output_maps) {
if (!engine->HasDeclared(output)) {
engine->DeclareOutput(output);
}
for (auto &output : output_maps) {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
}
};
......
......@@ -24,8 +24,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_CUDA_ONLY_OP(tensorrt_engine);
USE_NO_KERNEL_OP(tensorrt_engine);
namespace paddle {
namespace operators {
......
......@@ -67,6 +67,13 @@ ENDIF()
# avoiding cyclic dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
if(WIN32)
if(WITH_GPU AND NOT WITH_DSO)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
target_link_libraries(device_context ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
endif(WIN32)
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
cc_test(init_test SRCS init_test.cc DEPS device_context)
......
......@@ -16,6 +16,26 @@ limitations under the License. */
#include <stddef.h>
#ifdef _WIN32
#if defined(__AVX2__)
#include <immintrin.h> //avx2
#elif defined(__AVX__)
#include <intrin.h> //avx
#endif // AVX
#else // WIN32
#ifdef __AVX__
#include <immintrin.h>
#endif
#endif // WIN32
#if defined(_WIN32)
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
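/* Usage sketch (illustrative): wrap a declaration with both macros so it
   is 32-byte aligned under MSVC and GCC/Clang alike, e.g.
     const float ALIGN32_BEG ones[8] ALIGN32_END = {1, 1, 1, 1, 1, 1, 1, 1};
*/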
namespace paddle {
namespace platform {
......
......@@ -19,7 +19,7 @@ namespace paddle {
namespace platform {
#if CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void *user_data);
static void CUDART_CB StreamCallbackFunc(void *user_data)
#else
static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
cudaError_t status, void *user_data)
......
......@@ -19,10 +19,6 @@ if(WITH_PYTHON)
endif(WITH_AMD_GPU)
if(WIN32)
if(WITH_GPU AND NOT WITH_DSO)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
target_link_libraries(paddle_pybind ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_pybind shlwapi)
endif(WIN32)
......
......@@ -41,6 +41,23 @@ namespace pd = paddle::framework;
namespace paddle {
namespace pybind {
using set_name_func = void (pd::DataFeedDesc::*)(const std::string&);
#ifdef PADDLE_WITH_PSLIB
void BindAsyncExecutor(py::module* m) {
py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
.def(py::init([](framework::Scope* scope, const platform::Place& place) {
return std::unique_ptr<framework::AsyncExecutor>(
new framework::AsyncExecutor(scope, place));
}))
.def("run_from_files", &framework::AsyncExecutor::RunFromFile)
.def("init_server", &framework::AsyncExecutor::InitServer)
.def("init_worker", &framework::AsyncExecutor::InitWorker)
.def("start_server", &framework::AsyncExecutor::StartServer)
.def("stop_server", &framework::AsyncExecutor::StopServer)
.def("gather_servers", &framework::AsyncExecutor::GatherServers)
.def("init_model", &framework::AsyncExecutor::InitModel)
.def("save_model", &framework::AsyncExecutor::SaveModel);
} // end BindAsyncExecutor
#else
void BindAsyncExecutor(py::module* m) {
py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
.def(py::init([](framework::Scope* scope, const platform::Place& place) {
......@@ -49,5 +66,6 @@ void BindAsyncExecutor(py::module* m) {
}))
.def("run_from_files", &framework::AsyncExecutor::RunFromFile);
} // end BindAsyncExecutor
#endif
} // end namespace pybind
} // end namespace paddle
......@@ -960,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default False)DOC")
.def_property(
"memory_optimize",
[](const BuildStrategy &self) { return self.memory_optimize_; },
[](BuildStrategy &self, bool b) { self.memory_optimize_ = b; })
.def_property(
"memory_early_delete",
[](const BuildStrategy &self) { return self.memory_early_delete_; },
[](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
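// From Python these properties read and write the underlying
// BuildStrategy fields, e.g. (illustrative):
//   build_strategy = fluid.BuildStrategy()
//   build_strategy.memory_optimize = True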
.def("_finalize_strategy_and_create_passes",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(true);
......
......@@ -150,7 +150,7 @@ def __bootstrap__():
read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'selected_gpus'
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus'
]
core.init_gflags([sys.argv[0]] +
......
......@@ -24,6 +24,8 @@ from paddle.fluid.proto import data_feed_pb2
from google.protobuf import text_format
from . import io
from .data_feed_desc import DataFeedDesc
from .distributed import ps_instance
from .contrib.utils import hdfs_utils as hdfs
__all__ = ['AsyncExecutor']
......@@ -74,7 +76,7 @@ class AsyncExecutor(object):
Note: Only running on CPUPlace is supported.
"""
def __init__(self, place=None):
def __init__(self, place=None, run_mode=""):
if place is None:
place = core.CPUPlace()
if not isinstance(place, core.CPUPlace):
......@@ -85,8 +87,16 @@ class AsyncExecutor(object):
scope = global_scope()
self.executor = core.AsyncExecutor(scope, p)
self.instance = None
def run(self, program, data_feed, filelist, thread_num, fetch, debug=False):
def run(self,
program,
data_feed,
filelist,
thread_num,
fetch,
mode="",
debug=False):
"""
Run program by this AsyncExecutor. Training dataset will be in filelist.
Users can also inspect certain variables by naming them in parameter
......@@ -106,6 +116,7 @@ class AsyncExecutor(object):
thread_num(int): number of concurrent training threads. See
:code:`Note` for how to set this properly
fetch(str|list): the var name or a list of var names to inspect
mode(str): run mode of this interface
debug(bool): When set to True, fetch vars will be printed to
standard output after each minibatch
......@@ -148,4 +159,152 @@ class AsyncExecutor(object):
self.executor.run_from_files(program_desc,
data_feed.desc(), filelist, thread_num,
fetch_var_names, debug)
fetch_var_names, mode, debug)
def download_data(self,
afs_path,
local_path,
fs_default_name,
ugi,
file_cnt,
hadoop_home="$HADOOP_HOME",
process_num=12):
"""
download_data is a default download method for distributed training;
a user may also download data without this method
Example:
>>> exe = fluid.AsyncExecutor()
>>> exe.download_data("/xxx/xxx/xx/", "./data",
>>> "afs://xxx.xxx.xxx.xxx:9901", "xxx,yyy")
Args:
afs_path(str): afs_path defined by users
local_path(str): download data path
fs_default_name(str): file system server address
ugi(str): hadoop ugi
file_cnt(int): a user can specify the file number for debugging
hadoop_home(str): hadoop home path
process_num(int): download process num
"""
if self.instance is None:
raise ValueError('instance is None, please run '
'config_distributed_nodes to init instance')
configs = {"fs.default.name": fs_default_name, "hadoop.job.ugi": ugi}
client = hdfs.HDFSClient(hadoop_home, configs)
downloads = hdfs.multi_download(
client,
afs_path,
local_path,
self.instance.get_worker_index(),
self.instance.get_node_cnt() / 2,
file_cnt,
multi_processes=process_num)
self.instance.barrier_worker() #wait for download_data
def get_instance(self):
"""
get current node's instance so that user can do operations
in distributed setting
"""
if self.instance is None:
raise ValueError(
'instance is None, please run config_distributed_nodes to init instance'
)
return self.instance
def config_distributed_nodes(self):
"""
if a user needs to run a distributed async executor,
he or she needs to do a global configuration first so that
information about the current process can be obtained
"""
self.instance = ps_instance.PaddlePSInstance(1, 2)
return self.instance
def stop(self):
"""
at the end of the process, users should call stop to stop the servers
and to barrier all workers
"""
if self.instance is None:
raise ValueError(
'instance is None, please run config_distributed_nodes to init instance'
)
self.instance.barrier_worker() #worker do all things
if self.instance.is_first_worker():
self.executor.stop_server()
self.instance.barrier_worker() #sync
self.instance.barrier_all()
self.instance.finalize()
def init_server(self, dist_desc):
"""
initialize server of current node if current process is a server
Args:
dist_desc(str): a protobuf string that describes
how to init a worker and a server
"""
if self.instance is None:
raise ValueError(
'instance is None, please run config_distributed_nodes to init instance'
)
self.executor.init_server(dist_desc, self.instance._rankid)
ip = self.executor.start_server()
self.instance.set_ip(ip)
self.instance.barrier_all() #wait all server start
ips = self.instance.gather_ips()
self.executor.gather_servers(ips, self.instance.get_node_cnt())
self.instance.barrier_all() #wait all worker start
def init_worker(self, dist_desc, startup_program):
"""
initialize worker of current node if current process is a worker
Args:
dist_desc(str): a protobuf string that describes
how to init a worker and a server
startup_program(fluid.Program): startup program of current process
"""
if self.instance is None:
raise ValueError(
'instance is None, please run config_distributed_nodes to init instance'
)
place = core.CPUPlace()
executor = Executor(place)
executor.run(startup_program)
self.instance.barrier_all() #wait all server start
ips = self.instance.gather_ips()
self.executor.init_worker(dist_desc, ips,
self.instance.get_node_cnt(),
self.instance._rankid)
self.instance.barrier_all() #wait all worker start
if self.instance.is_first_worker():
self.executor.init_model()
self.instance.barrier_worker() #wait init model
def init_model(self):
"""
init_model command that can be invoked from one of the workers;
model parameters are initialized on the servers
"""
if self.instance is None:
raise ValueError(
'instance is None, please run config_distributed_nodes to init instance'
)
self.executor.init_model()
def save_model(self, save_path):
"""
save_model command that can be invoked from one of the workers;
model parameters are saved on the servers and uploaded to save_path on the file system
Args:
save_path(str): save path to file system
"""
if self.instance is None:
raise ValueError(
'instance is None, please run config_distributed_nodes to init instance'
)
self.executor.save_model(save_path)
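def _distributed_flow_sketch(dist_desc, startup_program, main_program,
                             data_feed, filelist, thread_num, fetch):
    """
    A minimal sketch (illustrative, not part of the public API) of the
    distributed flow above. It assumes dist_desc is a serialized
    PSParameter string and that PaddlePSInstance exposes
    is_server()/is_worker().
    """
    exe = AsyncExecutor()
    instance = exe.config_distributed_nodes()
    if instance.is_server():
        exe.init_server(dist_desc)
    else:
        exe.init_worker(dist_desc, startup_program)
        exe.run(main_program, data_feed, filelist, thread_num, fetch)
        exe.stop()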
......@@ -249,69 +249,6 @@ def serialize_op_decs(op_desc):
return proto.__str__()
def _callback_lookup_(op):
"""
Only used in _append_backward_ops_
Build and returns a callback function for certain op. For example
parallel_do: AllReduce
:param op:
:return: callback function
"""
if op.type == 'parallel_do' and op.attr('use_nccl'):
all_vars = op.block.vars
param_names = set(op.input('parameters'))
param_names = [
name for name in param_names
if all_vars[name].stop_gradient is False
]
param_grad_names = [n + "@GRAD" for n in param_names]
class ParallelDoCallBack(object):
def __init__(self, param_grad_names, parallel_scopes_name):
self.has_inserted_nccl_init = False
self.param_grad_names = param_grad_names
self.parallel_scopes_name = parallel_scopes_name
def __call__(self, block, context):
if not self.has_inserted_nccl_init:
op_desc = _create_op_desc_(
"ncclInit",
{"parallel_scopes": self.parallel_scopes_name},
{"Communicator": ['nccl_com__do_not_change_']}, {})
block.program.global_block().desc.append_op().copy_from(
op_desc)
self.has_inserted_nccl_init = True
current_op_desc = context["__current_op_desc__"]
for o_param in current_op_desc.output_names():
for o_argu in current_op_desc.output(o_param):
if o_argu in self.param_grad_names:
allreduce_out_name = o_argu + "__nccl_all_reduce__"
op_desc = _create_op_desc_(
"ncclReduce",
{
"X": [o_argu],
"Communicator":
['nccl_com__do_not_change_']
},
{"Out": [allreduce_out_name]},
{"reduction": "ncclSum",
"root": 0}, )
block.desc.append_op().copy_from(op_desc)
op_desc = _create_op_desc_(
"assign", {"X": [allreduce_out_name]},
{"Out": [o_argu]}, {})
block.desc.append_op().copy_from(op_desc)
return ParallelDoCallBack(param_grad_names,
op.output("parallel_scopes"))
else:
return None
def _append_backward_ops_(block,
ops,
target_block,
......@@ -349,17 +286,8 @@ def _append_backward_ops_(block,
sub_block = program.block(op._block_attr_id("sub_block"))
grad_sub_block = program._create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op)
if cb is not None:
if callbacks is None:
new_callbacks = [cb]
else:
new_callbacks = callbacks + [_callback_lookup_(op)]
_append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
no_grad_dict, grad_to_var, new_callbacks)
else:
_append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
no_grad_dict, grad_to_var, callbacks)
_append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
no_grad_dict, grad_to_var, callbacks)
program._rollback()
grad_sub_block_list.append(grad_sub_block.desc)
......@@ -424,9 +352,6 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
# infer_shape and infer_type
op_desc.infer_var_type(block.desc)
op_desc.infer_shape(block.desc)
# ncclInit dones't need to set data_type
if op_desc.type() == 'ncclInit':
continue
for arg in op_desc.output_arg_names():
if arg in new_vars:
_infer_var_data_type_(arg, block)
......
......@@ -13,10 +13,10 @@
# limitations under the License.
from __future__ import print_function
from . import lookup_table_utils
from .lookup_table_utils import *
#from . import lookup_table_utils
#from .lookup_table_utils import *
from . import hdfs_utils
from .hdfs_utils import *
__all__ = lookup_table_utils.__all__
#__all__ = lookup_table_utils.__all__
__all__ = hdfs_utils.__all__
......@@ -32,6 +32,28 @@ _logger.setLevel(logging.INFO)
class HDFSClient(object):
"""
A tool for HDFS
Args:
hadoop_home (string): hadoop home path
configs (dict): hadoop config, it is a dict that must contain \
keys "fs.default.name" and "hadoop.job.ugi"
Examples:
hadoop_home = "/home/client/hadoop-client/hadoop/"
configs = {
"fs.default.name": "hdfs://xxx.hadoop.com:54310",
"hadoop.job.ugi": "hello,hello123"
}
client = HDFSClient(hadoop_home, configs)
client.ls("/user/com/train-25")
files = client.lsr("/user/com/train-25/models")
"""
def __init__(self, hadoop_home, configs):
self.pre_commands = []
hadoop_bin = '%s/bin/hadoop' % hadoop_home
......@@ -52,9 +74,13 @@ class HDFSClient(object):
ret_code = 0
ret_out = None
ret_err = None
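# join into a single string because the command is executed
# through the shell (shell=True below)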
whole_commands = " ".join(whole_commands)
for x in range(retry_times + 1):
proc = subprocess.Popen(
whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
whole_commands,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True)
(output, errors) = proc.communicate()
ret_code, ret_out, ret_err = proc.returncode, output, errors
if ret_code:
......@@ -68,10 +94,12 @@ class HDFSClient(object):
def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
"""
upload the local file to hdfs
args:
local_file_path: the local file path
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
return:
Args:
hdfs_path: hdfs path, target path
local_path: local file path, source path
overwrite: will overwrite the original file
retry_times: max number of retries for the upload
Returns:
True or False
"""
assert hdfs_path is not None
......@@ -114,10 +142,12 @@ class HDFSClient(object):
def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
"""
download from hdfs
args:
local_file_path: the local file path
remote_file_path: remote dir on hdfs
return:
Args:
hdfs_path: hdfs path, source path
local_path: local file path, target path
overwrite: will remove the original file and overwrite it.
unzip: ignore this param
Returns:
True or False
"""
_logger.info('Downloading %r to %r.', hdfs_path, local_path)
......@@ -159,11 +189,11 @@ class HDFSClient(object):
def is_exist(self, hdfs_path=None):
"""
whether the remote hdfs path exists
args:
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
Args:
hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
fs_name: The default values are the same as in the job configuration
fs_ugi: The default values are the same as in the job configuration
return:
Returns:
True or False
"""
exist_cmd = ['-test', '-e', hdfs_path]
......@@ -182,11 +212,11 @@ class HDFSClient(object):
def is_dir(self, hdfs_path=None):
"""
whether the remote hdfs path is a directory
args:
Args:
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
fs_name: The default values are the same as in the job configuration
fs_ugi: The default values are the same as in the job configuration
return:
Returns:
True or False
"""
......@@ -206,15 +236,17 @@ class HDFSClient(object):
return True
def delete(self, hdfs_path):
"""Remove a file or directory from HDFS.
:param hdfs_path: HDFS path.
:param recursive: Recursively delete files and directories. By default,
this method will raise an :class:`HdfsError` if trying to delete a
non-empty directory.
"""
Remove a file or directory from HDFS.
Args:
:param hdfs_path: HDFS path.
:param recursive: Recursively delete files and directories. By default,
this method will raise an :class:`HdfsError` if trying to delete a
non-empty directory.
Returns:
This function returns `True` if the deletion was successful and `False` if
no file or directory previously existed at `hdfs_path`.
"""
_logger.info('Deleting %r.', hdfs_path)
......@@ -240,14 +272,17 @@ class HDFSClient(object):
return True
def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
"""Move a file or folder.
:param hdfs_src_path: Source path.
:param hdfs_dst_path: Destination path. If the path already exists and is
a directory, the source will be moved into it. If the path exists and is
a file, or if a parent destination directory is missing, this method will
raise an :class:`HdfsError`.
"""
Rename a file or folder.
Args:
:param hdfs_src_path: Source path.
:param hdfs_dst_path: Destination path. If the path already exists and is
a directory, the source will be moved into it. If the path exists and is
a file, or if a parent destination directory is missing, this method will
raise an :class:`HdfsError`.
Returns:
This function returns `True` if the rename was successful and `False` if
the rename failed.
"""
assert hdfs_src_path is not None
assert hdfs_dst_path is not None
......@@ -273,6 +308,11 @@ class HDFSClient(object):
@staticmethod
def make_local_dirs(local_path):
"""
create a local directory, same as mkdir
Args:
local_path: local path where the directory will be created.
"""
try:
os.makedirs(local_path)
except OSError as e:
......@@ -281,9 +321,11 @@ class HDFSClient(object):
def makedirs(self, hdfs_path):
"""Create a remote directory, recursively if necessary.
:param hdfs_path: Remote path. Intermediate directories will be created
appropriately.
Args:
:param hdfs_path: Remote path. Intermediate directories will be created
appropriately.
Returns:
True if the directories were created successfully, False otherwise.
"""
_logger.info('Creating directories to %r.', hdfs_path)
assert hdfs_path is not None
......@@ -303,6 +345,13 @@ class HDFSClient(object):
return True
def ls(self, hdfs_path):
"""
ls a hdfs_path.
Args:
:param hdfs_path: the hdfs path to list.
Returns:
This function returns a `list` that contains all files in the hdfs_path.
"""
assert hdfs_path is not None
if not self.is_exist(hdfs_path):
......@@ -328,6 +377,14 @@ class HDFSClient(object):
return ret_lines
def lsr(self, hdfs_path, only_file=True, sort=True):
"""
ls an hdfs_path, sorted by time.
Args:
:param hdfs_path: the hdfs path to list.
Returns:
This function returns a `list` that contains all files in the hdfs_path, sorted by time.
"""
def sort_by_time(v1, v2):
v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M')
v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M')
......@@ -371,12 +428,15 @@ def multi_upload(client,
multi_processes=5,
overwrite=False):
"""
:param overwrite: will overwrite hdfs file or not
:param multi_processes: the upload data process at the same time, default=5
:param client: instance of HDFSClient
:param hdfs_path: path on hdfs
:param local_path: path on local
:return:
Upload files to hdfs.
Args:
:param overwrite: will overwrite the hdfs file or not
:param multi_processes: the number of processes uploading data at the same time, default=5
:param client: instance of HDFSClient
:param hdfs_path: path on hdfs
:param local_path: path on local
Returns:
"""
def __subprocess_upload(datas):
......@@ -386,6 +446,13 @@ def multi_upload(client,
client.upload(hdfs_re_path, data, overwrite, retry_times=5)
def get_local_files(path):
"""
Get all local files
Args:
path: local file path
Returns:
A list that contains all files in the path.
"""
rlist = []
if not os.path.isdir(path):
......@@ -426,16 +493,21 @@ def multi_download(client,
local_path,
trainer_id,
trainers,
file_cnt,
multi_processes=5):
"""
multi_download
:param client: instance of HDFSClient
:param hdfs_path: path on hdfs
:param local_path: path on local
:param trainer_id: current trainer id
:param trainers: all trainers number
:param multi_processes: the download data process at the same time, default=5
:return: None
Args:
:param client: instance of HDFSClient
:param hdfs_path: path on hdfs
:param local_path: path on local
:param trainer_id: current trainer id
:param trainers: all trainers number
:param file_cnt: all file number
:param multi_processes: the number of processes downloading data at the same time, default=5
Returns:
A list of the files that were downloaded.
"""
def __subprocess_download(datas):
......@@ -449,7 +521,7 @@ def multi_download(client,
client.make_local_dirs(local_path)
_logger.info("Make local dir {} successfully".format(local_path))
all_need_download = client.lsr(hdfs_path, sort=True)
all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt]
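# files are sharded round-robin across trainers: trainer k takes
# files k, k + trainers, k + 2 * trainers, ... of the sorted listing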
need_download = all_need_download[trainer_id::trainers]
_logger.info("Get {} files From all {} files need to be download from {}".
format(len(need_download), len(all_need_download), hdfs_path))
......@@ -500,6 +572,7 @@ if __name__ == "__main__":
"/home/xx/data1",
1,
5,
100,
multi_processes=5)
multi_upload(client, "/user/com/train-25/model", "/home/xx/data1")
......@@ -15,12 +15,52 @@
LOOKUP_TABLE_TYPE = "lookup_table"
def find_distributed_lookup_table_inputs(program, table_name):
"""
Find the input variables of the distributed lookup table in the program.
We only support one distributed table now.
Args:
program(Program): given program, locate distributed lookup table
table_name(str): given table name that is found beforehand
Returns:
inputs
"""
local_vars = program.current_block().vars
inputs = []
for op in program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if table_name == op.input("W")[0]:
inputs.extend([local_vars[name] for name in op.input("Ids")])
return inputs
def find_distributed_lookup_table_outputs(program, table_name):
"""
Find the output variables of the distributed lookup table in the program.
We only support one distributed table now.
Args:
program(Program): given program, locate distributed lookup table
table_name(str): given table name that is found beforehand
Returns:
outputs
"""
local_vars = program.current_block().vars
outputs = []
for op in program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if table_name == op.input("W")[0]:
outputs.extend([local_vars[name] for name in op.output("Out")])
return outputs
def find_distributed_lookup_table(program):
"""
Find the distributed lookup table in the program.
We only support one distributed table now.
:param program:
:return: table_name or None
Args:
program(Program): given program, locate distributed lookup table
Returns:
table_name or None
"""
table_name = None
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .node import DownpourServer
from .node import DownpourWorker
from ..backward import append_backward
import ps_pb2 as pslib
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs
from google.protobuf import text_format
class DownpourSGD(object):
"""
Distributed optimizer of downpour stochastic gradient descent
Standard implementation of Google's Downpour SGD
in Large Scale Distributed Deep Networks
Args:
learning_rate (float): the learning rate used to update parameters. \
Can be a float value
Examples:
.. code-block:: python
downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2)
downpour_sgd.minimize(cost)
"""
def __init__(self, learning_rate=0.001, window=1):
# todo(guru4elephant): add more optimizers here as argument
# todo(guru4elephant): make learning_rate as a variable
self.learning_rate_ = learning_rate
self.window_ = window
self.type = "downpour"
def minimize(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None):
"""
DownpourSGD is a distributed optimizer, so
calling minimize generates both the backward
operators and the optimization operators within the minimize function.
Args:
loss(Variable): loss variable defined by user
startup_program(Program): startup program that defined by user
parameter_list(str list): parameter names defined by users
no_grad_set(set): a set of variables that is defined by users
so that these variables do not need gradient computation
Returns:
[ps_param, worker_skipped_ops]
ps_param: parameter server protobuf desc
worker_skipped_ops: operator names that need
to be skipped during execution
"""
params_grads = sorted(
append_backward(loss, parameter_list, no_grad_set),
key=lambda x: x[0].name)
table_name = find_distributed_lookup_table(loss.block.program)
prefetch_slots = find_distributed_lookup_table_inputs(
loss.block.program, table_name)
prefetch_slots_emb = find_distributed_lookup_table_outputs(
loss.block.program, table_name)
server = DownpourServer()
# window is communication strategy
worker = DownpourWorker(self.window_)
# Todo(guru4elephant): support multiple tables definitions
# currently support one big sparse table
sparse_table_index = 0
# currently merge all dense parameters into one dense table
dense_table_index = 1
params = []
grads = []
for i in params_grads:
params.append(i[0])
grads.append(i[1])
server.add_sparse_table(sparse_table_index, self.learning_rate_,
prefetch_slots, prefetch_slots_emb)
server.add_dense_table(dense_table_index, self.learning_rate_, params,
grads)
worker.add_sparse_table(sparse_table_index, self.learning_rate_,
prefetch_slots, prefetch_slots_emb)
worker.add_dense_table(dense_table_index, self.learning_rate_, params,
grads)
ps_param = pslib.PSParameter()
ps_param.server_param.CopyFrom(server.get_desc())
ps_param.trainer_param.CopyFrom(worker.get_desc())
# Todo(guru4elephant): figure out how to support more sparse parameters
# currently only support lookup_table
worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
ps_param.trainer_param.skip_op.extend(worker_skipped_ops)
return [ps_param, worker_skipped_ops]
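# Usage sketch (illustrative; `cost` is assumed to be a loss Variable from a
# network that uses one distributed lookup_table): minimize() returns the
# pslib proto plus the op names each worker must skip, and the proto can be
# inspected with text_format.
#
#   downpour_sgd = DownpourSGD(learning_rate=0.2, window=1)
#   ps_param, skipped_ops = downpour_sgd.minimize(cost)
#   print(text_format.MessageToString(ps_param))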
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class FileSystem(object):
"""
A file system wrapper that builds the hadoop client desc used by async_executor.
Args:
fs_type (string): fs type, e.g. "afs"
uri (string): hadoop param
user (string): hadoop param
passwd (string): hadoop param
hadoop_bin (string): hadoop param
Examples:
fs = FileSystem()
"""
def __init__(self,
fs_type="afs",
uri="afs://xx",
user=None,
passwd=None,
hadoop_bin=""):
assert user is not None
assert passwd is not None
assert hadoop_bin is not None
import ps_pb2 as pslib
self.fs_client = pslib.FsClientParameter()
self.fs_client.uri = uri
self.fs_client.user = user
self.fs_client.passwd = passwd
#self.fs_client.buffer_size = 0
self.fs_client.hadoop_bin = hadoop_bin
#self.fs_client.afs_conf = afs_conf if not afs_conf else ""
def get_desc(self):
"""
get hadoop desc.
"""
return self.fs_client
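# Usage sketch (illustrative values only): get_desc() hands back the
# pslib.FsClientParameter proto that async_executor consumes.
#
#   fs = FileSystem(fs_type="afs", uri="afs://example:9902",
#                   user="me", passwd="secret",
#                   hadoop_bin="/usr/local/hadoop/bin/hadoop")
#   fs_desc = fs.get_desc()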
class MPIHelper(object):
"""
MPIHelper is a wrapper around mpi4py that provides get_rank, get_size, etc.
Args:
No params
Examples:
mh = MPIHelper()
mh.get_ip()
"""
def __init__(self):
from mpi4py import MPI
self.comm = MPI.COMM_WORLD
self.MPI = MPI
def get_rank(self):
return self.comm.Get_rank()
def get_size(self):
return self.comm.Get_size()
def get_ip(self):
import socket
local_ip = socket.gethostbyname(socket.gethostname())
return local_ip
def get_hostname(self):
import socket
return socket.gethostname()
def finalize(self):
self.MPI.Finalize()
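# Usage sketch (illustrative; launch under MPI, e.g. `mpirun -np 4 python ...`):
#
#   mh = MPIHelper()
#   print("rank {} of {} on {}".format(mh.get_rank(), mh.get_size(),
#                                      mh.get_hostname()))
#   mh.finalize()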
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ps_pb2 as pslib
class Server(object):
"""
A Server basic class.
"""
def __init__(self):
pass
class Worker(object):
"""
A Worker basic class.
"""
def __init__(self):
pass
class DownpourServer(Server):
"""
DownpourServer class is used to generate server program_desc
Args:
server: it is pslib.ServerParameter()
Examples:
server = DownpourServer()
"""
def __init__(self):
self.server_ = pslib.ServerParameter()
self.server_.downpour_server_param.service_param.start_server_port = 0
self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
self.server_.downpour_server_param.service_param.service_class = "DownpourPsService"
self.server_.downpour_server_param.service_param.server_thread_num = 12
def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
slot_value_var):
"""
Args:
table_id(int): id of the sparse params table
learning_rate(float): the learning rate used to update parameters. \
Can be a float value
slot_key_vars(list): slot key variables (ids)
slot_value_var(list): slot value variables (embeddings)
Returns:
None
"""
table = self.server_.downpour_server_param.downpour_table_param.add()
table.table_id = table_id
table.table_class = "DownpourSparseTable"
table.type = pslib.PS_SPARSE_TABLE
table.accessor.accessor_class = "DownpourFeatureValueAccessor"
table.accessor.sparse_sgd_param.learning_rate = learning_rate
table.accessor.sparse_sgd_param.initial_g2sum = 3
table.accessor.sparse_sgd_param.initial_range = 1e-4
table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10])
table.accessor.embedx_dim = 8
table.accessor.embedx_threshold = 5
table.accessor.fea_dim = 11
table.accessor.downpour_accessor_param.nonclk_coeff = 0.1
table.accessor.downpour_accessor_param.click_coeff = 2
table.accessor.downpour_accessor_param.base_threshold = 0.2
table.accessor.downpour_accessor_param.delta_threshold = 0.15
table.accessor.downpour_accessor_param.delta_keep_days = 31
table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999
table.accessor.downpour_accessor_param.delete_threshold = 0.8
def add_dense_table(self, table_id, learning_rate, param_var, grad_var):
"""
Args:
table_id(int): id of the dense params table
learning_rate(float): the learning rate used to update parameters. \
Can be a float value
param_var(list): all dense params; it is a list.
grad_var(list): all dense gradients; it is a list.
Returns:
None
"""
table = self.server_.downpour_server_param.downpour_table_param.add()
table.table_id = table_id
table.table_class = "DownpourDenseTable"
table.type = pslib.PS_DENSE_TABLE
table.accessor.accessor_class = "DownpourDenseValueAccessor"
table.accessor.dense_sgd_param.name = "adam"
table.accessor.dense_sgd_param.adam.learning_rate = learning_rate
table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993
table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999
table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8
table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99
table.accessor.dense_sgd_param.naive.learning_rate = 0.0002
fea_dim = 0
for param in filter(lambda x: x.name.find("embedding") == -1,
param_var):
fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
table.accessor.fea_dim = fea_dim
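# Worked example (illustrative): fea_dim is the total element count of all
# non-embedding dense params. For two params with shapes (4, 5) and (10,):
#   reduce(lambda x, y: x * y, (4, 5), 1) -> 20
#   reduce(lambda x, y: x * y, (10,), 1)  -> 10
#   fea_dim = 20 + 10 = 30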
def get_desc(self):
"""
Return downpour server program_desc
"""
return self.server_
class DownpourWorker(Worker):
"""
DownpourWorker class is used to generate worker program_desc
Args:
window (int): push params frequency
worker: it is pslib.DownpourTrainerParameter
Examples:
worker = DownpourWorker(1)
"""
def __init__(self, window):
self.window = window
self.worker_ = pslib.DownpourTrainerParameter()
def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
slot_value_vars):
"""
Args:
table_id(int): id of the sparse params table
learning_rate(float): the learning rate used to update parameters. \
Can be a float value
slot_key_vars(list): slot key variables (ids)
slot_value_vars(list): slot value variables (embeddings)
Returns:
None
"""
table = self.worker_.sparse_table.add()
table.table_id = table_id
table.slot_key.extend([var.name for var in slot_key_vars])
table.slot_value.extend([var.name for var in slot_value_vars])
table.slot_gradient.extend(
[var.name + "@GRAD" for var in slot_value_vars])
def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars):
"""
Args:
table_id(int): id of the dense params table
learning_rate(float): the learning rate used to update parameters. \
Can be a float value
param_vars(list): all dense params; it is a list.
grad_vars(list): all dense gradients; it is a list.
Returns:
None
"""
table = self.worker_.dense_table.add()
table.table_id = table_id
table.dense_variable_name.extend(
filter(lambda x: x.find("embedding") == -1,
[p.name for p in param_vars]))
table.dense_gradient_variable_name.extend(
filter(lambda x: x.find("embedding") == -1,
[g.name for g in grad_vars]))
def get_desc(self):
"""
Return downpour worker program_desc
"""
return self.worker_
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .helper import MPIHelper
class PaddlePSInstance(object):
"""
PaddlePSInstance class is used to generate an instance of a server or worker
Args:
server_worker_mode: a value of 0 or 1, default is 1
proc_per_node: processes per node, default is 2
Examples:
instance = PaddlePSInstance(1, 2)
"""
def __init__(self, server_worker_mode, proc_per_node):
self.dh = MPIHelper()
self._rankid = self.dh.get_rank()
self._server_worker_mode = server_worker_mode
self._proc_per_node = proc_per_node
self._nodes = self.dh.get_size()
self._ip = 0
self._worker_num = self._nodes * self._proc_per_node / 2
self._server_num = self._nodes * self._proc_per_node / 2
self._total_server_worker = self._worker_num + self._server_num
self._node_type = None #IDLE=-1, WORKER=1, SERVER=0
self._set_nodetype()
self._comm = None
self._split_comm()
def _set_nodetype(self):
if self._server_worker_mode == 0:
if self._rankid < self._server_num:
self._node_type = 1
elif self._rankid < self._total_server_worker:
self._node_type = 0
else:
self._node_type = -1
elif self._server_worker_mode == 1:
if self._rankid < self._total_server_worker:
if 0 == self._rankid % self._proc_per_node % 2:
self._node_type = 0
else:
self._node_type = 1
else:
self._node_type = -1
else:
self._node_type = -1
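# Worked example (illustrative) for server_worker_mode=1 with 2 nodes and
# proc_per_node=2 (so _total_server_worker=4): rank % proc_per_node % 2 == 0
# marks a server, giving
#   rank 0 -> SERVER, rank 1 -> WORKER, rank 2 -> SERVER, rank 3 -> WORKER
# i.e. one server process and one worker process per node.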
def _split_comm(self):
if self.is_server():
self._comm = self.dh.comm.Split(self._node_type)
elif self.is_worker():
self._comm = self.dh.comm.Split(self._node_type)
pass
def get_worker_index(self):
"""
Return worker index
"""
if self._server_worker_mode == 0:
return self._rankid - self._server_num
else:
return self._rankid / self._proc_per_node
def get_server_index(self):
"""
Return server index
"""
if self._server_worker_mode == 0:
return self._rankid
else:
return self._rankid / self._proc_per_node
def is_worker(self):
"""
Return whether the instance is a worker
"""
return self._node_type == 1
def is_server(self):
"""
Return whether the instance is a server
"""
return self._node_type == 0
def is_first_worker(self):
"""
Return whether the instance is the first worker
"""
return self.is_worker() and 0 == self.get_worker_index()
def set_ip(self, ip):
"""
set server ip
"""
self._ip = ip
def gather_ips(self):
"""
Return all server and worker ips gathered through MPI allgather
"""
self._ips = self.dh.comm.allgather(self._ip)
return self._ips
def get_node_cnt(self):
"""
Return node cnt
"""
return self._nodes
def barrier_all(self):
"""
barrier workers and servers
"""
self.dh.comm.barrier()
def barrier_worker(self):
"""
barrier workers
"""
if self.is_worker():
self._comm.barrier()
pass
def finalize(self):
"""
MPI finalize
"""
self.dh.finalize()
pass
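# Bootstrap sketch (illustrative; run under mpirun): each server publishes its
# ip, then every rank learns the full endpoint list via allgather.
# `my_server_ip` below is a hypothetical placeholder.
#
#   instance = PaddlePSInstance(server_worker_mode=1, proc_per_node=2)
#   if instance.is_server():
#       instance.set_ip(my_server_ip)
#   ips = instance.gather_ips()
#   instance.barrier_all()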
if __name__ == "__main__":
instance = PaddlePSInstance(1, 2)
instance.barrier_all()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: ps.proto
import sys
_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1'))
from google.protobuf.internal import enum_type_wrapper
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='ps.proto',
package='paddle',
syntax='proto2',
serialized_pb=_b(
'\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01'
))
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
_TABLETYPE = _descriptor.EnumDescriptor(
name='TableType',
full_name='paddle.TableType',
filename=None,
file=DESCRIPTOR,
values=[
_descriptor.EnumValueDescriptor(
name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None),
_descriptor.EnumValueDescriptor(
name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None),
],
containing_type=None,
options=None,
serialized_start=3286,
serialized_end=3338, )
_sym_db.RegisterEnumDescriptor(_TABLETYPE)
TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE)
_PSCMDID = _descriptor.EnumDescriptor(
name='PsCmdID',
full_name='paddle.PsCmdID',
filename=None,
file=DESCRIPTOR,
values=[
_descriptor.EnumValueDescriptor(
name='PS_PULL_DENSE_TABLE',
index=0,
number=0,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_PUSH_DENSE_TABLE',
index=1,
number=1,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_PULL_SPARSE_TABLE',
index=2,
number=2,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_PUSH_SPARSE_TABLE',
index=3,
number=3,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None),
_descriptor.EnumValueDescriptor(
name='PS_SAVE_ONE_TABLE',
index=5,
number=5,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_SAVE_ALL_TABLE',
index=6,
number=6,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_LOAD_ONE_TABLE',
index=7,
number=7,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_LOAD_ALL_TABLE',
index=8,
number=8,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_CLEAR_ONE_TABLE',
index=9,
number=9,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_CLEAR_ALL_TABLE',
index=10,
number=10,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_PUSH_DENSE_PARAM',
index=11,
number=11,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PS_STOP_SERVER', index=12, number=12, options=None,
type=None),
],
containing_type=None,
options=None,
serialized_start=3341,
serialized_end=3658, )
_sym_db.RegisterEnumDescriptor(_PSCMDID)
PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID)
PS_SPARSE_TABLE = 0
PS_DENSE_TABLE = 1
PS_PULL_DENSE_TABLE = 0
PS_PUSH_DENSE_TABLE = 1
PS_PULL_SPARSE_TABLE = 2
PS_PUSH_SPARSE_TABLE = 3
PS_SHRINK_TABLE = 4
PS_SAVE_ONE_TABLE = 5
PS_SAVE_ALL_TABLE = 6
PS_LOAD_ONE_TABLE = 7
PS_LOAD_ALL_TABLE = 8
PS_CLEAR_ONE_TABLE = 9
PS_CLEAR_ALL_TABLE = 10
PS_PUSH_DENSE_PARAM = 11
PS_STOP_SERVER = 12
_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor(
name='FsApiType',
full_name='paddle.FsClientParameter.FsApiType',
filename=None,
file=DESCRIPTOR,
values=[
_descriptor.EnumValueDescriptor(
name='HDFS', index=0, number=0, options=None, type=None),
_descriptor.EnumValueDescriptor(
name='AFS', index=1, number=1, options=None, type=None),
],
containing_type=None,
options=None,
serialized_start=3254,
serialized_end=3284, )
_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE)
_PSPARAMETER = _descriptor.Descriptor(
name='PSParameter',
full_name='paddle.PSParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='worker_class',
full_name='paddle.PSParameter.worker_class',
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='server_class',
full_name='paddle.PSParameter.server_class',
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='instance_class',
full_name='paddle.PSParameter.instance_class',
index=2,
number=3,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='worker_param',
full_name='paddle.PSParameter.worker_param',
index=3,
number=101,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='server_param',
full_name='paddle.PSParameter.server_param',
index=4,
number=102,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='trainer_param',
full_name='paddle.PSParameter.trainer_param',
index=5,
number=301,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='fs_client_param',
full_name='paddle.PSParameter.fs_client_param',
index=6,
number=501,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=21,
serialized_end=307, )
_WORKERPARAMETER = _descriptor.Descriptor(
name='WorkerParameter',
full_name='paddle.WorkerParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='downpour_worker_param',
full_name='paddle.WorkerParameter.downpour_worker_param',
index=0,
number=1,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=309,
serialized_end=390, )
_SERVERPARAMETER = _descriptor.Descriptor(
name='ServerParameter',
full_name='paddle.ServerParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='downpour_server_param',
full_name='paddle.ServerParameter.downpour_server_param',
index=0,
number=1,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=392,
serialized_end=473, )
_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor(
name='DownpourWorkerParameter',
full_name='paddle.DownpourWorkerParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='downpour_table_param',
full_name='paddle.DownpourWorkerParameter.downpour_table_param',
index=0,
number=1,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=475,
serialized_end=554, )
_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor(
name='DownpourTrainerParameter',
full_name='paddle.DownpourTrainerParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='dense_table',
full_name='paddle.DownpourTrainerParameter.dense_table',
index=0,
number=1,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='sparse_table',
full_name='paddle.DownpourTrainerParameter.sparse_table',
index=1,
number=2,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='push_sparse_per_batch',
full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch',
index=2,
number=3,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='push_dense_per_batch',
full_name='paddle.DownpourTrainerParameter.push_dense_per_batch',
index=3,
number=4,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='skip_op',
full_name='paddle.DownpourTrainerParameter.skip_op',
index=4,
number=5,
type=9,
cpp_type=9,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=557,
serialized_end=763, )
_DENSETABLEPARAMETER = _descriptor.Descriptor(
name='DenseTableParameter',
full_name='paddle.DenseTableParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='table_id',
full_name='paddle.DenseTableParameter.table_id',
index=0,
number=1,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='dense_variable_name',
full_name='paddle.DenseTableParameter.dense_variable_name',
index=1,
number=2,
type=9,
cpp_type=9,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='dense_gradient_variable_name',
full_name='paddle.DenseTableParameter.dense_gradient_variable_name',
index=2,
number=3,
type=9,
cpp_type=9,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='fea_dim',
full_name='paddle.DenseTableParameter.fea_dim',
index=3,
number=4,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=765,
serialized_end=888, )
_SPARSETABLEPARAMETER = _descriptor.Descriptor(
name='SparseTableParameter',
full_name='paddle.SparseTableParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='table_id',
full_name='paddle.SparseTableParameter.table_id',
index=0,
number=1,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='feature_dim',
full_name='paddle.SparseTableParameter.feature_dim',
index=1,
number=2,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='slot_key',
full_name='paddle.SparseTableParameter.slot_key',
index=2,
number=3,
type=9,
cpp_type=9,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='slot_value',
full_name='paddle.SparseTableParameter.slot_value',
index=3,
number=4,
type=9,
cpp_type=9,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='slot_gradient',
full_name='paddle.SparseTableParameter.slot_gradient',
index=4,
number=5,
type=9,
cpp_type=9,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=890,
serialized_end=1012, )
_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor(
name='DownpourServerParameter',
full_name='paddle.DownpourServerParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='downpour_table_param',
full_name='paddle.DownpourServerParameter.downpour_table_param',
index=0,
number=1,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='service_param',
full_name='paddle.DownpourServerParameter.service_param',
index=1,
number=2,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=1015,
serialized_end=1149, )
_SERVERSERVICEPARAMETER = _descriptor.Descriptor(
name='ServerServiceParameter',
full_name='paddle.ServerServiceParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='server_class',
full_name='paddle.ServerServiceParameter.server_class',
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=True,
default_value=_b("DownpourBrpcPsServer").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='client_class',
full_name='paddle.ServerServiceParameter.client_class',
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=True,
default_value=_b("DownpourBrpcPsClient").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='service_class',
full_name='paddle.ServerServiceParameter.service_class',
index=2,
number=3,
type=9,
cpp_type=9,
label=1,
has_default_value=True,
default_value=_b("DownpourPsService").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='start_server_port',
full_name='paddle.ServerServiceParameter.start_server_port',
index=3,
number=4,
type=13,
cpp_type=3,
label=1,
has_default_value=True,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='server_thread_num',
full_name='paddle.ServerServiceParameter.server_thread_num',
index=4,
number=5,
type=13,
cpp_type=3,
label=1,
has_default_value=True,
default_value=12,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=1152,
serialized_end=1367, )
_TABLEPARAMETER = _descriptor.Descriptor(
name='TableParameter',
full_name='paddle.TableParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='table_id',
full_name='paddle.TableParameter.table_id',
index=0,
number=1,
type=4,
cpp_type=4,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='table_class',
full_name='paddle.TableParameter.table_class',
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='shared_num',
full_name='paddle.TableParameter.shared_num',
index=2,
number=3,
type=4,
cpp_type=4,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='accessor',
full_name='paddle.TableParameter.accessor',
index=3,
number=4,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='type',
full_name='paddle.TableParameter.type',
index=4,
number=5,
type=14,
cpp_type=8,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='compress_in_save',
full_name='paddle.TableParameter.compress_in_save',
index=5,
number=6,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=1370,
serialized_end=1561, )
_TABLEACCESSORPARAMETER = _descriptor.Descriptor(
name='TableAccessorParameter',
full_name='paddle.TableAccessorParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='accessor_class',
full_name='paddle.TableAccessorParameter.accessor_class',
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='sparse_sgd_param',
full_name='paddle.TableAccessorParameter.sparse_sgd_param',
index=1,
number=2,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='dense_sgd_param',
full_name='paddle.TableAccessorParameter.dense_sgd_param',
index=2,
number=3,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='fea_dim',
full_name='paddle.TableAccessorParameter.fea_dim',
index=3,
number=4,
type=13,
cpp_type=3,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='embedx_dim',
full_name='paddle.TableAccessorParameter.embedx_dim',
index=4,
number=5,
type=13,
cpp_type=3,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='embedx_threshold',
full_name='paddle.TableAccessorParameter.embedx_threshold',
index=5,
number=6,
type=13,
cpp_type=3,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='downpour_accessor_param',
full_name='paddle.TableAccessorParameter.downpour_accessor_param',
index=6,
number=7,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='table_accessor_save_param',
full_name='paddle.TableAccessorParameter.table_accessor_save_param',
index=7,
number=8,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=1564,
serialized_end=1933, )
_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
name='DownpourTableAccessorParameter',
full_name='paddle.DownpourTableAccessorParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='nonclk_coeff',
full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff',
index=0,
number=1,
type=2,
cpp_type=6,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='click_coeff',
full_name='paddle.DownpourTableAccessorParameter.click_coeff',
index=1,
number=2,
type=2,
cpp_type=6,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='base_threshold',
full_name='paddle.DownpourTableAccessorParameter.base_threshold',
index=2,
number=3,
type=2,
cpp_type=6,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='delta_threshold',
full_name='paddle.DownpourTableAccessorParameter.delta_threshold',
index=3,
number=4,
type=2,
cpp_type=6,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='delta_keep_days',
full_name='paddle.DownpourTableAccessorParameter.delta_keep_days',
index=4,
number=5,
type=2,
cpp_type=6,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='show_click_decay_rate',
full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate',
index=5,
number=6,
type=2,
cpp_type=6,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='delete_threshold',
full_name='paddle.DownpourTableAccessorParameter.delete_threshold',
index=6,
number=7,
type=2,
cpp_type=6,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=1936,
serialized_end=2142, )
_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
name='TableAccessorSaveParameter',
full_name='paddle.TableAccessorSaveParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='param',
full_name='paddle.TableAccessorSaveParameter.param',
index=0,
number=1,
type=13,
cpp_type=3,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='converter',
full_name='paddle.TableAccessorSaveParameter.converter',
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='deconverter',
full_name='paddle.TableAccessorSaveParameter.deconverter',
index=2,
number=3,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2144,
serialized_end=2227, )
_PSREQUESTMESSAGE = _descriptor.Descriptor(
name='PsRequestMessage',
full_name='paddle.PsRequestMessage',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='cmd_id',
full_name='paddle.PsRequestMessage.cmd_id',
index=0,
number=1,
type=13,
cpp_type=3,
label=2,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='table_id',
full_name='paddle.PsRequestMessage.table_id',
index=1,
number=2,
type=13,
cpp_type=3,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='params',
full_name='paddle.PsRequestMessage.params',
index=2,
number=3,
type=12,
cpp_type=9,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='client_id',
full_name='paddle.PsRequestMessage.client_id',
index=3,
number=4,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='data',
full_name='paddle.PsRequestMessage.data',
index=4,
number=5,
type=12,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b(""),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2229,
serialized_end=2330, )
_SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
name='SparseSGDRuleParameter',
full_name='paddle.SparseSGDRuleParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='learning_rate',
full_name='paddle.SparseSGDRuleParameter.learning_rate',
index=0,
number=1,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='initial_g2sum',
full_name='paddle.SparseSGDRuleParameter.initial_g2sum',
index=1,
number=2,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='initial_range',
full_name='paddle.SparseSGDRuleParameter.initial_range',
index=2,
number=3,
type=1,
cpp_type=5,
label=1,
has_default_value=True,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='weight_bounds',
full_name='paddle.SparseSGDRuleParameter.weight_bounds',
index=3,
number=4,
type=2,
cpp_type=6,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2332,
serialized_end=2451, )
_DENSESGDRULEPARAMETER = _descriptor.Descriptor(
name='DenseSGDRuleParameter',
full_name='paddle.DenseSGDRuleParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='name',
full_name='paddle.DenseSGDRuleParameter.name',
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='adam',
full_name='paddle.DenseSGDRuleParameter.adam',
index=1,
number=2,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='naive',
full_name='paddle.DenseSGDRuleParameter.naive',
index=2,
number=3,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='summary',
full_name='paddle.DenseSGDRuleParameter.summary',
index=3,
number=4,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='moving_average',
full_name='paddle.DenseSGDRuleParameter.moving_average',
index=4,
number=5,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2454,
serialized_end=2679, )
_ADAMSGDPARAMETER = _descriptor.Descriptor(
name='AdamSGDParameter',
full_name='paddle.AdamSGDParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='learning_rate',
full_name='paddle.AdamSGDParameter.learning_rate',
index=0,
number=1,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='avg_decay_rate',
full_name='paddle.AdamSGDParameter.avg_decay_rate',
index=1,
number=2,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='ada_decay_rate',
full_name='paddle.AdamSGDParameter.ada_decay_rate',
index=2,
number=3,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='ada_epsilon',
full_name='paddle.AdamSGDParameter.ada_epsilon',
index=3,
number=4,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='mom_decay_rate',
full_name='paddle.AdamSGDParameter.mom_decay_rate',
index=4,
number=5,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2682,
serialized_end=2816, )
_NAIVESGDPARAMETER = _descriptor.Descriptor(
name='NaiveSGDParameter',
full_name='paddle.NaiveSGDParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='learning_rate',
full_name='paddle.NaiveSGDParameter.learning_rate',
index=0,
number=1,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='avg_decay_rate',
full_name='paddle.NaiveSGDParameter.avg_decay_rate',
index=1,
number=2,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2818,
serialized_end=2884, )
_SUMMARYSGDPARAMETER = _descriptor.Descriptor(
name='SummarySGDParameter',
full_name='paddle.SummarySGDParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='summary_decay_rate',
full_name='paddle.SummarySGDParameter.summary_decay_rate',
index=0,
number=1,
type=1,
cpp_type=5,
label=1,
has_default_value=True,
default_value=float(0.999999),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2886,
serialized_end=2945, )
_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
name='MovingAverageRuleParameter',
full_name='paddle.MovingAverageRuleParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='momentum',
full_name='paddle.MovingAverageRuleParameter.momentum',
index=0,
number=1,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2947,
serialized_end=2993, )
_PSRESPONSEMESSAGE = _descriptor.Descriptor(
name='PsResponseMessage',
full_name='paddle.PsResponseMessage',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='err_code',
full_name='paddle.PsResponseMessage.err_code',
index=0,
number=1,
type=5,
cpp_type=1,
label=2,
has_default_value=True,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='err_msg',
full_name='paddle.PsResponseMessage.err_msg',
index=1,
number=2,
type=9,
cpp_type=9,
label=2,
has_default_value=True,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='data',
full_name='paddle.PsResponseMessage.data',
index=2,
number=3,
type=12,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b(""),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=2995,
serialized_end=3068, )
_FSCLIENTPARAMETER = _descriptor.Descriptor(
name='FsClientParameter',
full_name='paddle.FsClientParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='fs_type',
full_name='paddle.FsClientParameter.fs_type',
index=0,
number=1,
type=14,
cpp_type=8,
label=1,
has_default_value=True,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='uri',
full_name='paddle.FsClientParameter.uri',
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='user',
full_name='paddle.FsClientParameter.user',
index=2,
number=3,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='passwd',
full_name='paddle.FsClientParameter.passwd',
index=3,
number=4,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='buffer_size',
full_name='paddle.FsClientParameter.buffer_size',
index=4,
number=5,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='hadoop_bin',
full_name='paddle.FsClientParameter.hadoop_bin',
index=5,
number=51,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='afs_conf',
full_name='paddle.FsClientParameter.afs_conf',
index=6,
number=101,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[],
serialized_start=3071,
serialized_end=3284, )
_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER
_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER
_PSPARAMETER.fields_by_name[
'trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER
_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER
_WORKERPARAMETER.fields_by_name[
'downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER
_SERVERPARAMETER.fields_by_name[
'downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER
_DOWNPOURWORKERPARAMETER.fields_by_name[
'downpour_table_param'].message_type = _TABLEPARAMETER
_DOWNPOURTRAINERPARAMETER.fields_by_name[
'dense_table'].message_type = _DENSETABLEPARAMETER
_DOWNPOURTRAINERPARAMETER.fields_by_name[
'sparse_table'].message_type = _SPARSETABLEPARAMETER
_DOWNPOURSERVERPARAMETER.fields_by_name[
'downpour_table_param'].message_type = _TABLEPARAMETER
_DOWNPOURSERVERPARAMETER.fields_by_name[
'service_param'].message_type = _SERVERSERVICEPARAMETER
_TABLEPARAMETER.fields_by_name[
'accessor'].message_type = _TABLEACCESSORPARAMETER
_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE
_TABLEACCESSORPARAMETER.fields_by_name[
'sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER
_TABLEACCESSORPARAMETER.fields_by_name[
'dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER
_TABLEACCESSORPARAMETER.fields_by_name[
'downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER
_TABLEACCESSORPARAMETER.fields_by_name[
'table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER
_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER
_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER
_DENSESGDRULEPARAMETER.fields_by_name[
'summary'].message_type = _SUMMARYSGDPARAMETER
_DENSESGDRULEPARAMETER.fields_by_name[
'moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER
_FSCLIENTPARAMETER.fields_by_name[
'fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE
_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER
DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER
DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER
DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER
DESCRIPTOR.message_types_by_name[
'DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER
DESCRIPTOR.message_types_by_name[
'DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER
DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER
DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER
DESCRIPTOR.message_types_by_name[
'DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER
DESCRIPTOR.message_types_by_name[
'ServerServiceParameter'] = _SERVERSERVICEPARAMETER
DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER
DESCRIPTOR.message_types_by_name[
'TableAccessorParameter'] = _TABLEACCESSORPARAMETER
DESCRIPTOR.message_types_by_name[
'DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER
DESCRIPTOR.message_types_by_name[
'TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER
DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE
DESCRIPTOR.message_types_by_name[
'SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER
DESCRIPTOR.message_types_by_name[
'DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER
DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER
DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER
DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER
DESCRIPTOR.message_types_by_name[
'MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER
DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE
DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER
DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE
DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID
PSParameter = _reflection.GeneratedProtocolMessageType(
'PSParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_PSPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.PSParameter)
))
_sym_db.RegisterMessage(PSParameter)
WorkerParameter = _reflection.GeneratedProtocolMessageType(
'WorkerParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_WORKERPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
))
_sym_db.RegisterMessage(WorkerParameter)
ServerParameter = _reflection.GeneratedProtocolMessageType(
'ServerParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_SERVERPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.ServerParameter)
))
_sym_db.RegisterMessage(ServerParameter)
DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType(
'DownpourWorkerParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_DOWNPOURWORKERPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
))
_sym_db.RegisterMessage(DownpourWorkerParameter)
DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType(
'DownpourTrainerParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_DOWNPOURTRAINERPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
))
_sym_db.RegisterMessage(DownpourTrainerParameter)
DenseTableParameter = _reflection.GeneratedProtocolMessageType(
'DenseTableParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_DENSETABLEPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
))
_sym_db.RegisterMessage(DenseTableParameter)
SparseTableParameter = _reflection.GeneratedProtocolMessageType(
'SparseTableParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_SPARSETABLEPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
))
_sym_db.RegisterMessage(SparseTableParameter)
DownpourServerParameter = _reflection.GeneratedProtocolMessageType(
'DownpourServerParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_DOWNPOURSERVERPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
))
_sym_db.RegisterMessage(DownpourServerParameter)
ServerServiceParameter = _reflection.GeneratedProtocolMessageType(
'ServerServiceParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_SERVERSERVICEPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
))
_sym_db.RegisterMessage(ServerServiceParameter)
TableParameter = _reflection.GeneratedProtocolMessageType(
'TableParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_TABLEPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.TableParameter)
))
_sym_db.RegisterMessage(TableParameter)
TableAccessorParameter = _reflection.GeneratedProtocolMessageType(
'TableAccessorParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_TABLEACCESSORPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
))
_sym_db.RegisterMessage(TableAccessorParameter)
DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType(
'DownpourTableAccessorParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_DOWNPOURTABLEACCESSORPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter)
))
_sym_db.RegisterMessage(DownpourTableAccessorParameter)
TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType(
'TableAccessorSaveParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_TABLEACCESSORSAVEPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter)
))
_sym_db.RegisterMessage(TableAccessorSaveParameter)
PsRequestMessage = _reflection.GeneratedProtocolMessageType(
'PsRequestMessage',
(_message.Message, ),
dict(
DESCRIPTOR=_PSREQUESTMESSAGE,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
))
_sym_db.RegisterMessage(PsRequestMessage)
SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
'SparseSGDRuleParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_SPARSESGDRULEPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
))
_sym_db.RegisterMessage(SparseSGDRuleParameter)
DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
'DenseSGDRuleParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_DENSESGDRULEPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
))
_sym_db.RegisterMessage(DenseSGDRuleParameter)
AdamSGDParameter = _reflection.GeneratedProtocolMessageType(
'AdamSGDParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_ADAMSGDPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
))
_sym_db.RegisterMessage(AdamSGDParameter)
NaiveSGDParameter = _reflection.GeneratedProtocolMessageType(
'NaiveSGDParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_NAIVESGDPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
))
_sym_db.RegisterMessage(NaiveSGDParameter)
SummarySGDParameter = _reflection.GeneratedProtocolMessageType(
'SummarySGDParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_SUMMARYSGDPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
))
_sym_db.RegisterMessage(SummarySGDParameter)
MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType(
'MovingAverageRuleParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_MOVINGAVERAGERULEPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter)
))
_sym_db.RegisterMessage(MovingAverageRuleParameter)
PsResponseMessage = _reflection.GeneratedProtocolMessageType(
'PsResponseMessage',
(_message.Message, ),
dict(
DESCRIPTOR=_PSRESPONSEMESSAGE,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
))
_sym_db.RegisterMessage(PsResponseMessage)
FsClientParameter = _reflection.GeneratedProtocolMessageType(
'FsClientParameter',
(_message.Message, ),
dict(
DESCRIPTOR=_FSCLIENTPARAMETER,
__module__='ps_pb2'
# @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
))
_sym_db.RegisterMessage(FsClientParameter)
DESCRIPTOR.has_options = True
DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(),
_b('\200\001\001'))
# @@protoc_insertion_point(module_scope)
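For orientation, a minimal hedged sketch of how the generated classes registered above can be assembled into a configuration; the field names follow the descriptors in this file, while the URI and numeric values are illustrative placeholders:
.. code-block:: python
    import ps_pb2
    from google.protobuf import text_format
    ps_param = ps_pb2.PSParameter()
    fs = ps_param.fs_client_param          # FsClientParameter, per the descriptor
    fs.uri = 'hdfs://localhost:9000'       # hypothetical filesystem URI
    fs.user = 'demo_user'                  # hypothetical credentials
    fs.buffer_size = 1024
    sgd_rule = ps_pb2.DenseSGDRuleParameter()
    sgd_rule.naive.learning_rate = 0.001   # NaiveSGDParameter fields, as above
    sgd_rule.naive.avg_decay_rate = 0.999993
    # Round-trip through the protobuf text format to inspect the config.
    print(text_format.MessageToString(ps_param))
    print(text_format.MessageToString(sgd_rule))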
......@@ -563,8 +563,8 @@ class Operator(object):
OP_WITHOUT_KERNEL_SET = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select',
'checkpoint_notify', 'gen_nccl_id'
}
def __init__(self,
......
......@@ -226,156 +226,6 @@ class BlockGuard(object):
return True
class ParallelDo(object):
"""
ParallelDo is used to represent multi-thread data parallel processing.
Its vanilla implementation can be shown as the following (:math:`|` means
single thread and :math:`||||` means multiple threads)
.. code-block:: text
In the forward pass
| Split input onto different devices
| Copy parameter onto different devices
|||| Compute forward pass in parallel
| Merge output from different devices
In the backward pass
| Split output@grad onto different devices
|||| Compute backward pass in parallel
| accumulate param@grad from different devices to the first device
| Merge input@grad from different devices
| Copy param@grad to the place of parallel_do_op
Examples:
.. code-block:: python
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# ParallelDo version & Single-thread version
if thread_num > 1:
places = fluid.layers.get_places(thread_num)
pd = fluid.layers.control_flow.ParallelDo(places)
with pd.do():
images = pd.read_input(images)
label = pd.read_input(label)
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
pd.write_output(avg_cost)
avg_cost = pd()
avg_cost = fluid.layers.mean(avg_cost)
else:
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
.. warning::
It will soon be deprecated; please use ParallelExecutor instead.
"""
def __init__(self, places, use_nccl=False, name=None):
warnings.warn(
"API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.",
Warning)
self.helper = LayerHelper("parallel_do", name=name)
self.inputs = []
self.places = places
self.outputs = []
self.status = StaticRNN.BEFORE_RNN_BLOCK
self.use_nccl = use_nccl
def do(self):
return BlockGuardWithCompletion(self)
def parent_block(self):
prog = self.helper.main_program
parent_idx = prog.current_block().parent_idx
assert parent_idx >= 0
parent_block = prog.block(parent_idx)
return parent_block
def __call__(self, *args, **kwargs):
if self.status != StaticRNN.AFTER_RNN_BLOCK:
raise ValueError("RNN output can only be retrieved after rnn block")
if len(self.outputs) == 0:
raise ValueError("RNN has no output")
elif len(self.outputs) == 1:
return self.outputs[0]
else:
return self.outputs
def read_input(self, var):
self.inputs.append(var)
return var
def write_output(self, var):
self.outputs.append(var)
def get_parameters(self):
main_program = self.helper.main_program
current_block = main_program.current_block()
parent_block = self.parent_block()
local_inputs = set()
params = list()
for var in self.inputs:
local_inputs.add(var.name)
for op in current_block.ops:
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in local_inputs:
params.append(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
local_inputs.add(out_var_name)
params = list(set(params))
return [parent_block.var(name) for name in params]
def _complete_op(self):
main_program = self.helper.main_program
current_block = main_program.current_block()
parent_block = self.parent_block()
step_scope = parent_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
self.outputs = [
parent_block.create_var(
name=o.name,
shape=o.shape,
dtype=o.dtype,
lod_level=o.lod_level,
persistable=o.persistable,
stop_gradient=o.stop_gradient) for o in self.outputs
]
inputs = [parent_block.var(i.name) for i in self.inputs]
outputs = [parent_block.var(o.name) for o in self.outputs]
parent_block.append_op(
type='parallel_do',
inputs={
'inputs': inputs,
'parameters': self.get_parameters(),
'places': self.places
},
outputs={'outputs': outputs,
'parallel_scopes': [step_scope]},
attrs={'sub_block': current_block,
'use_nccl': self.use_nccl})
class BlockGuardWithCompletion(BlockGuard):
"""
BlockGuardWithCompletion class.
......@@ -384,9 +234,8 @@ class BlockGuardWithCompletion(BlockGuard):
"""
def __init__(self, rnn):
if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)):
raise TypeError(
"BlockGuardWithCompletion takes a StaticRNN or ParallelDo")
if not isinstance(rnn, StaticRNN):
raise TypeError("BlockGuardWithCompletion takes a StaticRNN")
super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program)
self.rnn = rnn
......
......@@ -176,6 +176,7 @@ __all__ = [
'get_tensor_from_selected_rows',
'lstm',
'psroi_pool',
'huber_loss',
]
kIgnoreIndex = -100
......@@ -497,7 +498,7 @@ def lstm(input,
If the device is GPU, this op will use the cuDNN LSTM implementation
A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
......@@ -524,19 +525,19 @@ def lstm(input,
- $\tilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represents a matrix multiplication
Args:
input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
init_h(Variable): The initial hidden state of the LSTM
This is a tensor with shape ( num_layers x batch_size x hidden_size)
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
init_c(Variable): The initial cell state of the LSTM.
This is a tensor with shape ( num_layers x batch_size x hidden_size )
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
max_len (int): max length of LSTM. The first dim of the input tensor CAN NOT be greater than max_len
hidden_size (int): hidden size of the LSTM
num_layers (int): total layers number of the LSTM
dropout_prob(float|0.0): dropout prob, dropout ONLY works between rnn layers, NOT between time steps
......@@ -555,10 +556,10 @@ def lstm(input,
if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2)
last_h(Tensor): the hidden state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
last_c(Tensor): the cell state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
Examples:
......@@ -4658,7 +4659,7 @@ def ctc_greedy_decoder(input, blank, name=None):
[0.5, 0.1, 0.3, 0.1]]
input.lod = [[4, 4]]
Computation:
step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
......@@ -4691,7 +4692,7 @@ def ctc_greedy_decoder(input, blank, name=None):
Returns:
Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
'Lp' is the sum of all output sequences' lengths. If all the sequences
in result were empty, the result LoDTensor will be [-1] with
LoD [[]] and dims [1, 1].
Examples:
......@@ -5050,7 +5051,7 @@ def hsigmoid(input,
"""
The hierarchical sigmoid operator is used to accelerate the training
process of language models. This operator organizes the classes into a
complete binary tree, or you can use is_custom to pass your own tree to
implement the hierarchy. Each leaf node represents a class (a word) and each
internal node acts as a binary classifier. For each word there's a unique
path from root to its leaf node; hsigmoid calculates the cost for each
......@@ -5066,7 +5067,7 @@ def hsigmoid(input,
2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
means the label of each binary classification, using 1 to indicate true and 0 to indicate false.
4. now, each word should have its path and code along the path; you can pass a batch of paths and codes
related to the same batch of inputs.
......@@ -5076,8 +5077,8 @@ def hsigmoid(input,
and :math:`D` is the feature size.
label (Variable): The tensor variable that contains labels of training data.
It's a tensor with shape :math:`[N \\times 1]`.
num_classes: (int), The number of classes, must not be less than 2. With the default tree this has to be set
and should never be None under is_custom=False; when is_custom is true, it should equal the number of
non-leaf nodes, which indicates the number of classes used by the binary classifiers.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
......@@ -5090,15 +5091,15 @@ def hsigmoid(input,
is not set, the bias is initialized zero. Default: None.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
path_table: (Variable|None) this variable can store each batch of samples' path to root,
it should be in leaf -> root order
path_table should have the same shape as path_code, and for each sample i, path_table[i] indicates an np.array-like
structure whose elements are indexes into the parent nodes' weight matrix.
path_code: (Variable|None) this variable can store each batch of samples' code,
each code consists of the codes of its parent nodes; it should be in leaf -> root order
is_custom: (bool|False) use a user-defined binary tree instead of the default complete binary tree; if is_custom is
set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
is_sparse: (bool|False) use sparse update instead of dense update; if set, the gradient
of W and input will be sparse.
Returns:
......@@ -9389,3 +9390,51 @@ def psroi_pool(input,
'pooled_width': pooled_width
})
return out
def huber_loss(input, label, delta):
"""
Huber loss is a loss function used in robust regression.
Huber loss can evaluate the fitness of input to label.
Different from MSE loss, Huber loss is more robust to outliers.
When the difference between input and label is larger than delta
.. math::
huber\_loss = delta * |label - input| - 0.5 * delta * delta
When the difference between input and label is no larger than delta
.. math::
huber\_loss = 0.5 * (label - input) * (label - input)
Args:
input (Variable): This input is a probability computed by the previous operator.
The first dimension is batch size, and the last dimension is 1.
label (Variable): The ground truth whose first dimension is batch size
and last dimension is 1.
delta (float): The parameter of huber loss, which controls
the range of outliers
Returns:
huber\_loss (Variable): The huber loss with shape [batch_size, 1].
Examples:
.. code-block:: python
predictions = fluid.layers.softmax(x)
loss = fluid.layers.huber_loss(input=predictions, label=label, delta=1.0)
"""
helper = LayerHelper('huber_loss', **locals())
residual = helper.create_variable_for_type_inference(
dtype=helper.input_dtype())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
helper.append_op(
type='huber_loss',
inputs={'X': input,
'Y': label},
outputs={'Out': out,
'Residual': residual},
attrs={'delta': delta})
return out
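A reference sketch of the piecewise computation documented above, in plain NumPy (for illustration only, not the operator's actual implementation):
.. code-block:: python
    import numpy as np
    def huber_ref(input, label, delta):
        # Follows the docstring's (label - input) residual convention.
        residual = label - input
        quadratic = 0.5 * residual ** 2                        # |residual| <= delta
        linear = delta * np.abs(residual) - 0.5 * delta ** 2   # |residual| > delta
        return np.where(np.abs(residual) <= delta, quadratic, linear)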
......@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer):
beta1 (float): The exponential decay rate for the 1st moment estimates.
beta2 (float): The exponential decay rate for the 2nd moment estimates.
epsilon (float): a small float value for numerical stability.
regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators,
and the accumulators are updated at every step. Every element of the two moving averages is updated
in both dense mode and sparse mode. If the parameter is very large, the update
may be very slow. Lazy mode only updates the elements that have gradients in the current
mini-batch, so it will be much faster. But this mode has different semantics from the
original Adam algorithm and may lead to different results.
Examples:
.. code-block:: python
......@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer):
beta2=0.999,
epsilon=1e-8,
regularization=None,
name=None):
name=None,
lazy_mode=False):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
......@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer):
self._beta1 = beta1
self._beta2 = beta2
self._epsilon = epsilon
self._lazy_mode = lazy_mode
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
......@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer):
attrs={
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon
"epsilon": self._epsilon,
"lazy_mode": self._lazy_mode
})
return adam_op
......
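A hedged usage sketch for the new flag; the toy network is illustrative, and only the lazy_mode argument comes from this change:
.. code-block:: python
    import paddle.fluid as fluid
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_pred = fluid.layers.fc(input=x, size=1)
    cost = fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_pred, label=y))
    # lazy_mode=True restricts moment updates to rows that actually
    # receive gradients in the current mini-batch.
    adam = fluid.optimizer.AdamOptimizer(learning_rate=0.001, lazy_mode=True)
    adam.minimize(cost)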
......@@ -15,7 +15,6 @@
from __future__ import print_function
from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
import unittest
import paddle.fluid as fluid
import paddle
......@@ -147,22 +146,7 @@ def train(word_dict,
cost, acc_out, prediction = net_method(
data, label, input_dim=dict_dim, class_dim=class_dim)
else:
places = get_places()
pd = ParallelDo(places)
with pd.do():
cost, acc, _ = net_method(
pd.read_input(data),
pd.read_input(label),
input_dim=dict_dim,
class_dim=class_dim)
pd.write_output(cost)
pd.write_output(acc)
cost, acc = pd()
cost = fluid.layers.mean(cost)
acc_out = fluid.layers.mean(acc)
prediction = None
assert save_dirname is None
raise NotImplementedError()
adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
adagrad.minimize(cost)
......
......@@ -25,7 +25,6 @@ import numpy
import paddle
import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
BATCH_SIZE = 64
......@@ -82,19 +81,7 @@ def train(nn_type,
net_conf = conv_net
if parallel:
places = get_places()
pd = ParallelDo(places)
with pd.do():
img_ = pd.read_input(img)
label_ = pd.read_input(label)
prediction, avg_loss, acc = net_conf(img_, label_)
for o in [avg_loss, acc]:
pd.write_output(o)
avg_loss, acc = pd()
# get mean loss and acc through every devices.
avg_loss = fluid.layers.mean(avg_loss)
acc = fluid.layers.mean(acc)
raise NotImplementedError()
else:
prediction, avg_loss, acc = net_conf(img, label)
......@@ -273,7 +260,7 @@ def inject_all_tests():
for use_cuda in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
for parallel in (False, True):
for parallel in (False, ):
for nn_type in ('mlp', 'conv'):
inject_test_method(use_cuda, parallel, nn_type, True)
......
......@@ -17,7 +17,6 @@ from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
import unittest
import os
import numpy as np
......@@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
avg_cost, predict_word = __network__(
[first_word, second_word, third_word, forth_word, next_word])
else:
places = get_places()
pd = ParallelDo(places)
with pd.do():
avg_cost, predict_word = __network__(
list(
map(pd.read_input, [
first_word, second_word, third_word, forth_word,
next_word
])))
pd.write_output(avg_cost)
avg_cost = fluid.layers.mean(pd())
raise NotImplementedError()
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
......@@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
for use_cuda in (False, True):
for is_sparse in (False, True):
for is_parallel in (False, True):
for is_parallel in (False, ):
inject_test_method(use_cuda, is_sparse, is_parallel)
if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import math
import sys
import paddle
import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
# We need to fix the random seed and the training data so that the loss
# values calculated by the default version and the memory-optimized
# version can be compared accurately.
fluid.default_startup_program().random_seed = 111
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
device_type = 'CPU'
use_nccl = False
place = fluid.CPUPlace()
if fluid.core.is_compiled_with_cuda():
device_type = 'CUDA'
use_nccl = False
place = fluid.CUDAPlace(0)
places = get_places(device_count=0, device_type=device_type)
pd = ParallelDo(places, use_nccl=use_nccl)
with pd.do():
x_ = pd.read_input(x)
y_ = pd.read_input(y)
y_predict = fluid.layers.fc(input=x_, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y_)
avg_cost = fluid.layers.mean(x=cost)
pd.write_output(avg_cost)
cost = pd()
avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program(), print_log=True)
# fluid.release_memory(fluid.default_main_program())
BATCH_SIZE = 200
# fix the order of training data
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
# train_reader = paddle.batch(
# paddle.reader.shuffle(
# paddle.dataset.uci_housing.train(), buf_size=500),
# batch_size=BATCH_SIZE)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
if avg_loss_value[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good.
print(avg_loss_value[0])
if math.isnan(float(avg_loss_value)):
sys.exit("got NaN loss, training failed.")
exit(1)
......@@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase):
seed=None,
use_parallel_executor=True,
use_reduce=False,
use_ir_memory_optimize=False,
fuse_elewise_add_act_ops=False,
optimizer=fluid.optimizer.Adam,
use_fast_executor=False,
......@@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
build_strategy.memory_optimize = use_ir_memory_optimize
build_strategy.enable_sequential_execution = enable_sequential_execution
if use_cuda and core.is_compiled_with_cuda():
build_strategy.remove_unnecessary_lock = True
......
......@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
return param_out, moment1_out, moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
lazy_mode):
'''
Simulate one step of the adam optimizer
:param inputs: dict of inputs
......@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
moment2_out = np.zeros(shape=[height, row_numel])
param_out = np.zeros(shape=[height, row_numel])
for idx, row_id in enumerate(rows):
def update_row(row_id, update_value):
moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
) * np_grad[idx]
) * update_value
moment2_out[row_id] = beta2 * moment2[row_id] + (
1 - beta2) * np.square(np_grad[idx])
1 - beta2) * np.square(update_value)
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
np.sqrt(moment2_out[row_id]) + epsilon))
if lazy_mode:
for idx, row_id in enumerate(rows):
update_row(row_id, np_grad[idx])
else:
for row_id in range(param_out.shape[0]):
update_value = np.zeros(np_grad[0].shape).astype("float32")
if row_id in rows:
update_value = np_grad[rows.index(row_id)]
update_row(row_id, update_value)
return param_out, moment1_out, moment2_out
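To make the branch above concrete, a small hedged sketch of which rows each mode touches (values are illustrative):
.. code-block:: python
    height, rows = 5, [0, 4]           # rows that received sparse gradients
    lazy_rows = set(rows)              # lazy mode: only these rows are updated
    dense_rows = set(range(height))    # dense mode: every row is updated, using a
                                       # zero gradient for rows not in `rows`
    print(sorted(dense_rows - lazy_rows))  # rows updated only in dense mode: [1, 2, 3]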
class TestSparseAdamOp(unittest.TestCase):
def setup(self, scope, place):
def setup(self, scope, place, lazy_mode):
beta1 = 0.78
beta2 = 0.836
epsilon = 1e-4
......@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase):
'Beta2Pow': np.array([beta2**10]).astype("float32"),
"LearningRate": np.full((1), 2.0).astype("float32")
}
self.init_output = np.full((height, row_numel), 0.0).astype("float32")
self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
grad_selected_rows = scope.var('Grad').get_selected_rows()
......@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase):
self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse(
self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
height, rows, row_numel,
np_array, lazy_mode)
self.outputs = {
"ParamOut": param_out,
"Moment1Out": mom1,
"Moment2Out": mom2
}
def check_with_place(self, place):
def check_with_place(self, place, lazy_mode):
scope = core.Scope()
self.setup(scope, place)
self.setup(scope, place, lazy_mode)
op_args = dict()
op_args['lazy_mode'] = lazy_mode
for key, np_array in self.dense_inputs.items():
var = scope.var(key).get_tensor()
var.set(np_array, place)
......@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase):
op_args[s] = s
for s in self.outputs:
var = scope.var(s).get_tensor()
var.set(self.outputs[s], place)
var.set(self.init_output, place)
op_args[s] = s
for k in self.attrs:
op_args[k] = self.attrs[k]
......@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase):
actual = np.array(out_var)
actual = actual.reshape([actual.size])
np_array = np_array.reshape([np_array.size])
for idx, row_id in enumerate(self.rows):
j = 0
while j < self.row_numel:
pos = row_id * self.row_numel + j
self.assertLess((actual[pos] - np_array[pos]) / actual[pos],
0.00001)
j += 1
def test_sparse_sgd(self):
for i in range(np_array.size):
self.assertLess((actual[i] - np_array[i]), 0.00001)
def test_sparse_adam(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
for lazy_mode in (True, False):
self.check_with_place(place, lazy_mode)
if __name__ == "__main__":
......
......@@ -39,6 +39,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost = network(data, label, len(word_dict))
cost.persistable = True
optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
optimizer.minimize(cost)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
import os
MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
def _feed_data_helper(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
return img, label
def simple_fc_net(use_feed):
x, y = _feed_data_helper(use_feed)
hidden_layer = 4
for _ in range(hidden_layer):
x = fluid.layers.fc(input=x, size=20, act='relu')
y_predict = fluid.layers.fc(input=x, size=10, act='softmax')
cost = fluid.layers.cross_entropy(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
return avg_cost
def fc_with_inplace_net(use_feed):
x, y = _feed_data_helper(use_feed)
fc = fluid.layers.fc(input=x, size=20, act='relu')
fc = fluid.layers.fc(input=fc, size=10, act='relu')
reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5])
reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2])
y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax')
cost = fluid.layers.cross_entropy(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
return avg_cost
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def _dummy_data(self):
np.random.seed(5)
img = np.random.random(size=[32, 784]).astype(np.float32)
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
def _compare_ir_and_python_memory_optimize(self, model, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._dummy_data()
first_loss0, last_loss0 = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
memory_opt=False,
use_ir_memory_optimize=False)
first_loss1, last_loss1 = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
memory_opt=False,
use_ir_memory_optimize=True)
for loss in zip(first_loss0, first_loss1):
self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
for loss in zip(last_loss0, last_loss1):
self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
def test_simple_fc_net(self):
self._compare_ir_and_python_memory_optimize(simple_fc_net, False)
self._compare_ir_and_python_memory_optimize(simple_fc_net, True)
def test_fc_with_reshape_net(self):
self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False)
self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
import paddle.fluid.profiler as profiler
import numpy
import six
class BaseParallelForTest(unittest.TestCase):
def run_test(self, callback, feed, fetch):
"""
Run the unittest for parallel.for
Args:
callback(callable): A callable function returns a generator. There
are two yields in the generator function. The first yield
returns the data layers, and the second yield returns the loss.
The modified data variables will be sent back during the first
yield.
feed(dict): The executor feeding dictionary.
fetch(list|basestr): The fetch name lists.
Returns:
None
Raises:
AssertionError when the computations of cpu, parallel.for on cpu,
gpu, and parallel.for on gpu differ.
"""
cpu = fluid.CPUPlace()
result_cpu = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=cpu,
use_parallel=False)
result_cpu_parallel = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=cpu,
use_parallel=True)
if fluid.core.is_compiled_with_cuda():
gpu = fluid.CUDAPlace(0)
result_gpu = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=gpu,
use_parallel=False,
use_gpu=True)
result_gpu_parallel = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=gpu,
use_parallel=True,
use_gpu=True)
result_gpu_nccl = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=gpu,
use_parallel=True,
use_nccl=True,
use_gpu=True)
self._assert_same_(fetch, result_cpu, result_cpu_parallel,
result_gpu, result_gpu_parallel, result_gpu_nccl)
else:
self._assert_same_(fetch, result_cpu, result_cpu_parallel)
def _run_test_impl_(self,
callback,
feed,
fetch,
place,
use_parallel=False,
use_nccl=False,
use_gpu=False):
"""
Run a single test, returns the fetch values
Args:
place(Place): the computation place.
use_parallel(bool): Whether use parallel.for or not.
Returns:
Fetched numpy arrays.
"""
if isinstance(fetch, six.string_types):
fetch = [fetch]
main = fluid.Program()
startup = fluid.Program()
# Fix seed
main.random_seed = 10
startup.random_seed = 10
with fluid.program_guard(main, startup):
generator = callback()
# Automatically insert parallel do if use_parallel = True
if use_parallel:
thread_num = fluid.core.get_cuda_device_count(
) if use_gpu else 8
places = get_places(thread_num)
pd = ParallelDo(places, use_nccl=use_nccl)
data = next(generator)
if isinstance(data, fluid.framework.Variable):
data = [data]
with pd.do():
ins = list(map(pd.read_input, data))
if len(ins) == 1:
ins = ins[0]
loss = generator.send(ins) # patch input
pd.write_output(loss)
loss = pd()
else:
data = next(generator)
loss = generator.send(data)
self.assertIsNotNone(loss)
avg_loss = fluid.layers.mean(loss)
fluid.backward.append_backward(loss=avg_loss)
exe = fluid.Executor(place)
exe.run(startup)
if use_gpu:
profile_type = 'GPU'
else:
profile_type = 'CPU'
with profiler.profiler(profile_type, 'total', '/tmp/profiler'):
return exe.run(main, feed=feed, fetch_list=fetch)
def _assert_same_(self, fetch, *args):
"""
Assert the return values of `run_test` are same.
Args:
fetch: Fetch list. Used for print error message
*args: The fetch result lists of each situations.
Returns:
None
Raises:
AssertionError
"""
def _impl_(a, b, fetch_id, item_id):
item_str = [
'CPU', 'ParallelCPU', 'GPU', 'ParallelGPU', 'ParallelGPUNCCL'
]
flag = numpy.allclose(a, b, rtol=0.1, atol=1e-3)
self.assertTrue(flag,
"The {0} are different in {1}, {2} vs {3}".format(
fetch[fetch_id], item_str[item_id], a, b))
for i, items in enumerate(zip(*args)):
self.assertGreater(len(items), 0)
for j in range(1, len(items)):
_impl_(items[0], items[j], fetch_id=i, item_id=j)
class ParallelOpTest(BaseParallelForTest):
@staticmethod
def __network__():
x = fluid.layers.data(shape=[784], dtype='float32', name='img')
x = yield x
hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
hidden = fluid.layers.batch_norm(input=hidden)
loss = fluid.layers.mean(hidden)
yield loss
def test_simple_fc(self):
self.run_test(
callback=self.__network__,
feed={
'img': numpy.random.random(size=(51, 784)).astype('float32')
},
fetch=['fc1.w@GRAD'])
def test_fc_with_tiny_data(self):
self.run_test(
callback=self.__network__,
feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
fetch=['fc1.w@GRAD'])
class ParallelOpTestMultipleInput(BaseParallelForTest):
@staticmethod
def __network__():
x = fluid.layers.data(
shape=[784], dtype='float32', name='img1', stop_gradient=False)
y = fluid.layers.data(
shape=[784], dtype='float32', name='img2', stop_gradient=False)
yield [x, y]
x = x + y
hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w')
hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w')
loss = fluid.layers.mean(hidden3)
yield loss
def test_simple_fc(self):
self.run_test(
callback=self.__network__,
feed={
'img1': numpy.random.random(size=(51, 784)).astype('float32'),
'img2': numpy.random.random(size=(51, 784)).astype('float32')
},
fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
if __name__ == '__main__':
unittest.main()
......@@ -35,14 +35,14 @@ dtype_to_size = {
}
SUB_BLOCK_OPS = [
"while", "while_grad", "parallel_do", "parallel_do_grad",
"conditional_block", "conditional_block_grad"
"while", "while_grad", "conditional_block", "conditional_block_grad"
]
SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
SUB_BLOCK_PAIR = [("while", "while_grad"),
("conditional_block", "conditional_block_grad")]
PRINT_LOG = False
FLAGS_memory_optimize = ""
class OrderedSet(MutableSet):
......@@ -121,6 +121,7 @@ class ControlFlowGraph(object):
self._defs = defaultdict(OrderedSet)
self._live_in = defaultdict(OrderedSet)
self._live_out = defaultdict(OrderedSet)
self._skip_opt = skip_opt
self.pool = []
......@@ -144,7 +145,6 @@ class ControlFlowGraph(object):
for i in range(self.op_size):
self._uses[i].update(self._ops[i].input_arg_names())
self._defs[i].update(self._ops[i].output_arg_names())
self._live_in[i] = self._uses[i]
def _update_graph(self, old_name, new_name, begin_idx=0):
for i in range(begin_idx, self.op_size):
......@@ -177,20 +177,52 @@ class ControlFlowGraph(object):
worklist.append(d)
def _fill_pool(self, i, is_forward):
def comparator(x, cache):
x_shape = x[1]
cache_shape = cache[1]
x_size = abs(reduce(lambda x, y: x * y, x_shape))
cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
if (x_shape[0] == -1 and cache_shape[0] == -1) or \
(x_shape[0] != -1 and cache_shape[0] != -1) :
return x_size <= cache_size
else:
return False
def find_var_in_block(x):
known_vars = set()
for op in self._ops:
known_vars.update(op.output_arg_names())
return x in known_vars
block_desc = self._ops[i].block()
in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
# NOTE: must sort the in_diff set for cases that get different cache var.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this.
can_optimize = [
x for x in in_diff
x for x in sorted(in_diff)
if self._check_var_validity(block_desc, x, is_forward)
]
if can_optimize:
for var_name in can_optimize:
cache = (var_name, self._find_var(block_desc, var_name,
is_forward).shape())
if cache not in self.pool:
self.pool.append(cache)
if cache not in self.pool and find_var_in_block(var_name):
i = 0
while i < len(self.pool):
mycache = self.pool[i]
mysize = mycache[1][0]
cache_size = cache[1][0]
if (mysize == -1 and cache_size == -1) or \
(mysize != -1 and cache_size != -1):
if comparator(mycache, cache):
i += 1
else:
break
elif mysize == -1 and cache_size != -1:
i += 1
elif mysize != -1 and cache_size == -1:
break
self.pool.insert(i, cache)
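The insertion loop above maintains an ordered pool: caches whose leading dimension is -1 (batch-dependent) come before fixed-shape caches, and each group stays sorted by absolute element count. A hedged illustration of the resulting order (the variable names are hypothetical):
.. code-block:: python
    # (var_name, shape) pairs, as stored in self.pool above.
    pool = [
        ('relu_0.tmp_0', (-1, 4)),     # batch-dependent, smallest first
        ('fc_1.tmp_0', (-1, 16)),      # batch-dependent, larger
        ('fc_2.w_0', (8, 8)),          # fixed-shape group starts here
        ('fc_3.w_0', (32, 32)),        # fixed-shape, largest last
    ]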
def _get_diff(self, a, b):
u = a & b
......@@ -229,7 +261,7 @@ class ControlFlowGraph(object):
def _update_skip_opt_set(self):
for i in range(self.op_size):
op = self._ops[i]
if op.type() == "fill_constant" and op.attr("force_cpu") == True:
if op.has_attr("force_cpu") and op.attr("force_cpu") == True:
self._skip_opt.update(op.output_arg_names())
def release_memory(self, skip_opt_set=None):
......@@ -281,6 +313,7 @@ class ControlFlowGraph(object):
# update skip set to meet users' demand
if skip_opt_set:
self._skip_opt.update(skip_opt_set)
counter = 0
for i in range(self.op_size):
op = self._ops[i]
if op.type() in SUB_BLOCK_OPS:
......@@ -301,6 +334,9 @@ class ControlFlowGraph(object):
# If x is both in uses and defs, it can not be optimized!
if x in self._uses[i]:
continue
if x == FLAGS_memory_optimize:
print("start match var ", x, " of op ", op.type())
print(self.pool)
for index, cache_pair in enumerate(self.pool):
cache_var = cache_pair[0]
cache_shape = cache_pair[1]
......@@ -323,15 +359,13 @@ class ControlFlowGraph(object):
if not compare_shape(x_shape, cache_shape, level):
continue
# TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
if x_dtype != cache_dtype:
continue
if PRINT_LOG:
print(("Hit Cache !!!! cache pool index "
"is %d, var name is %s, "
"cached var name is %s, "
"var shape is %s ") % (index, x, cache_var,
str(cache_shape)))
print(
("!!! %d, %s => %s, cache idx %d, pool size %d"
% (counter, x + str(x_shape),
cache_var + str(cache_shape), index,
len(self.pool))))
counter += 1
self.pool.pop(index)
# Rename the var to the cache var already with
# memory allocated in order to reuse the memory.
......@@ -484,8 +518,11 @@ def memory_optimize(input_program,
if level != 0 and level != 1:
raise ValueError("only support opt_level 0 or 1.")
if skip_opt_set is not None and not isinstance(skip_opt_set, set):
raise ValueError("only support skip_opt_set as set.")
if skip_opt_set is not None:
if isinstance(skip_opt_set, set) or isinstance(skip_opt_set, list):
skip_opt_set = set(skip_opt_set)
else:
raise ValueError("only support skip_opt_set as set.")
global PRINT_LOG
PRINT_LOG = print_log
if skip_grads:
......
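With the relaxed check above, skip_opt_set may now be passed as a list as well as a set. A hedged usage sketch (the variable names are hypothetical placeholders):
.. code-block:: python
    import paddle.fluid as fluid
    # Variables named here are excluded from memory reuse.
    fluid.memory_optimize(fluid.default_main_program(),
                          skip_opt_set=['fc_0.w_0', 'fc_0.b_0'],
                          print_log=False)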
......@@ -104,8 +104,10 @@ packages=['paddle',
'paddle.fluid.imperative',
'paddle.fluid.proto',
'paddle.fluid.proto.profiler',
'paddle.fluid.distributed',
'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.contrib.utils',
'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize',
'paddle.fluid.transpiler',
......@@ -241,5 +243,6 @@ setup(name='${PACKAGE_NAME}',
ext_modules=ext_modules,
package_data=package_data,
package_dir=package_dir,
scripts=paddle_bins
scripts=paddle_bins,
distclass=BinaryDistribution
)