提交 c2ccf145 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into refine/pyramiddnn

...@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " ...@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32) if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib) set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=") add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif(WIN32) endif(WIN32)
find_package(CUDA QUIET) find_package(CUDA QUIET)
......
...@@ -152,7 +152,12 @@ endif() ...@@ -152,7 +152,12 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB) if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
set(OPENMP_FLAGS "-fopenmp") if(WIN32)
# openmp not support well for now on windows
set(OPENMP_FLAGS "")
else(WIN32)
set(OPENMP_FLAGS "-fopenmp")
endif(WIN32)
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
......
...@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w") ...@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if (NOT WIN32) if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead # nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif() endif()
else(NOT WIN32) else(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
list(APPEND CUDA_NVCC_FLAGS "-g -G") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# match the cl's _ITERATOR_DEBUG_LEVEL list(APPEND CUDA_NVCC_FLAGS "-g -G")
list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") # match the cl's _ITERATOR_DEBUG_LEVEL
elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
else() list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
endif() endif()
endif(NOT WIN32) endif(NOT WIN32)
......
...@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire ...@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
IF(WIN32) IF(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
ELSE(WIN32) ELSE(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32) ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
...@@ -39,7 +41,7 @@ ExternalProject_Add( ...@@ -39,7 +41,7 @@ ExternalProject_Add(
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
...@@ -49,6 +49,8 @@ IF(NOT WIN32) ...@@ -49,6 +49,8 @@ IF(NOT WIN32)
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ELSE()
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
ENDIF(NOT WIN32) ENDIF(NOT WIN32)
ExternalProject_Add( ExternalProject_Add(
...@@ -61,7 +63,6 @@ ExternalProject_Add( ...@@ -61,7 +63,6 @@ ExternalProject_Add(
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
...@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) ...@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add( ExternalProject_Add(
extern_snappy extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy" GIT_REPOSITORY "https://github.com/google/snappy"
...@@ -31,7 +37,7 @@ ExternalProject_Add( ...@@ -31,7 +37,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
......
...@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag) ...@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag)
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
endif() endif()
endif() endif()
endif() endif()
endfunction() endfunction()
...@@ -147,12 +147,7 @@ set(GPU_COMMON_FLAGS ...@@ -147,12 +147,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array -Wno-error=array-bounds # Warnings in Eigen::array
) )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
endif(NOT WIN32) endif(NOT WIN32)
if (APPLE) if (APPLE)
...@@ -193,8 +188,7 @@ safe_set_static_flag() ...@@ -193,8 +188,7 @@ safe_set_static_flag()
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/W3") string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}") set(flag_var "${flag_var} /w")
endif(${flag_var} MATCHES "/W3")
endforeach(flag_var) endforeach(flag_var)
endif(WIN32) endif(WIN32)
...@@ -8,13 +8,13 @@ paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None ...@@ -8,13 +8,13 @@ paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None
paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.program_guard ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.name_scope ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.scope_guard ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
...@@ -66,7 +66,7 @@ paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'unifo ...@@ -66,7 +66,7 @@ paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'unifo
paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
...@@ -229,7 +229,7 @@ paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', ...@@ -229,7 +229,7 @@ paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes',
paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)) paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.layers.Preprocessor.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
...@@ -270,7 +270,7 @@ paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywo ...@@ -270,7 +270,7 @@ paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywo
paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.layers.DynamicRNN.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')) paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32'))
paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
...@@ -346,12 +346,12 @@ paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'st ...@@ -346,12 +346,12 @@ paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'st
paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)) paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
...@@ -456,7 +456,7 @@ paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', ' ...@@ -456,7 +456,7 @@ paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', '
paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)) paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
...@@ -491,14 +491,14 @@ paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], ...@@ -491,14 +491,14 @@ paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'],
paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None) paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None)
paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)) paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',))
paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.profiler.cuda_profiler ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.profiler.profiler ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None) paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None)
paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None) paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None)
paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.unique_name.guard ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
......
...@@ -158,18 +158,19 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) ...@@ -158,18 +158,19 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
if(WITH_DISTRIBUTE) if(WITH_NGRAPH)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog set(NGRAPH_EXE_DEPS ngraph_engine)
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper) else()
set(NGRAPH_EXE_DEPS)
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if(WITH_DISTRIBUTE)
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else() else()
if (WITH_NGRAPH) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
else ()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif() endif()
......
...@@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, ...@@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
auto& block = main_program.Block(0); auto& block = main_program.Block(0);
for (auto var_name : fetch_var_names) { for (auto var_name : fetch_var_names) {
auto var_desc = block.FindVar(var_name); auto var_desc = block.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name);
auto shapes = var_desc->GetShape(); auto shapes = var_desc->GetShape();
PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1, PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
"var %s: Fetched var has wrong shape, " "var %s: Fetched var has wrong shape, "
......
...@@ -54,8 +54,6 @@ cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph grap ...@@ -54,8 +54,6 @@ cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph grap
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
...@@ -67,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he ...@@ -67,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU) if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif() endif()
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph) cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......
...@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
new std::vector<OpDesc *>(main_program.Block(0).AllOps()); new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs, graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership all_op_descs); // take ownership
graph->Set<GraphNodePool>(kGraphNodePool,
new GraphNodePool); // take ownership
pass->Erase(kAllOpDescs); pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs); pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
......
...@@ -77,9 +77,6 @@ struct BuildStrategy { ...@@ -77,9 +77,6 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false}; bool fuse_relu_depthwise_conv_{false};
bool memory_optimize_{false}; bool memory_optimize_{false};
bool memory_early_delete_{false};
// TODO(dzhwinter): // TODO(dzhwinter):
// make enable_inplace, memory_optimize_ // make enable_inplace, memory_optimize_
// memory_early_delete_ true by default // memory_early_delete_ true by default
......
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct ComputationOpHandle : public OpHandleBase { class ComputationOpHandle : public OpHandleBase {
public: public:
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
size_t scope_idx); size_t scope_idx);
......
...@@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
->Var(details::kLocalExecScopeName) ->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope; ->GetMutable<Scope*>() = &local_scope;
for (size_t j = 0; j < input_scope_idxes.size(); ++j) { for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
local_scope.Var("out_var" + j); local_scope.Var("out_var" + std::to_string(j));
if (i == j) local_scope.Var("in_var" + j); if (i == j) local_scope.Var("in_var" + std::to_string(j));
} }
param_scopes_.emplace_back(&local_scope); param_scopes_.emplace_back(&local_scope);
} }
...@@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
// add input var handle // add input var handle
nodes_.emplace_back( nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i),
ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable)); ir::Node::Type::kVariable));
VarHandle* in_var_handle = VarHandle* in_var_handle = new VarHandle(
new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i], nodes_.back().get(), 1, input_scope_idxes[i],
"in_var" + i, place_list_[input_scope_idxes[i]]); "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]);
vars_.emplace_back(in_var_handle); vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle); op_handle_->AddInput(in_var_handle);
// add output var handle // add output var handle
for (size_t j = 0; j < place_list_.size(); ++j) { for (size_t j = 0; j < place_list_.size(); ++j) {
nodes_.emplace_back( nodes_.emplace_back(ir::CreateNodeForTest(
ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable)); "out_node" + std::to_string(i), ir::Node::Type::kVariable));
VarHandle* out_var_handle = new VarHandle( VarHandle* out_var_handle =
nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]); new VarHandle(nodes_.back().get(), 2, j,
"out_var" + std::to_string(i), place_list_[j]);
vars_.emplace_back(out_var_handle); vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle); op_handle_->AddOutput(out_var_handle);
} }
...@@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::vector<float>> send_vec; std::vector<std::vector<float>> send_vec;
f::LoD lod{{0, 10, 20}}; f::LoD lod{{0, 10, 20}};
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i); const std::string varname("in_var" + std::to_string(i));
float val_scalar = static_cast<float>(i); float val_scalar = static_cast<float>(i);
send_vec.push_back( send_vec.push_back(
InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar)); InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
...@@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
WaitAll(); WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i); const std::string& varname("out_var" + std::to_string(i));
for (size_t j = 0; j < place_list_.size(); ++j) { for (size_t j = 0; j < place_list_.size(); ++j) {
LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]); LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
} }
...@@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; 2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
int height = static_cast<int>(kDims[0] * 2); int height = static_cast<int>(kDims[0] * 2);
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i); const std::string varname("in_var" + std::to_string(i));
float val_scalar = static_cast<float>(i); float val_scalar = static_cast<float>(i);
send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i], send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
rows, height, val_scalar)); rows, height, val_scalar));
...@@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
WaitAll(); WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i); const std::string& varname("out_var" + std::to_string(i));
for (size_t j = 0; j < place_list_.size(); ++j) { for (size_t j = 0; j < place_list_.size(); ++j) {
SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows, SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
height); height);
......
...@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var, ...@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
} }
} }
const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, const NodeSwapQueue InplacePass::TryInplaceModifyVar(
const std::string& cache_var, const std::string& var, const std::string& cache_var, const size_t& idx,
const size_t& idx, ir::Graph* graph) const {
ir::Graph* graph) const {
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
var_nodes_[var].at(0)->Var() != nullptr); var_nodes_[var].at(0)->Var() != nullptr);
std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
var_desc->SetName(cache_var); var_desc->SetName(cache_var);
SSANodePair swap_nodes; NodeSwapQueue swap_nodes;
for (size_t i = idx; i < view_.AllOps().size(); ++i) { for (size_t i = idx; i < view_.AllOps().size(); ++i) {
auto* op = view_.AllOps()[i]; auto* op = view_.AllOps()[i];
...@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, ...@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
return swap_nodes; return swap_nodes;
} }
void InplacePass::CommitModify(const SSANodePair& swap_nodes, void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
ir::Graph* graph) const { ir::Graph* graph) const {
for (auto& pair : swap_nodes) { for (auto& pair : swap_nodes) {
auto *node = pair.first, *cache_node = pair.second; auto *node = pair.first, *cache_node = pair.second;
...@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes, ...@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
} }
} }
void InplacePass::WithdrawModify(const SSANodePair& nodes, void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
ir::Graph* graph) const { ir::Graph* graph) const {
for (auto& pair : nodes) { for (auto& pair : nodes) {
auto *node = pair.first, *cache_node = pair.second; auto *node = pair.first, *cache_node = pair.second;
......
...@@ -56,7 +56,8 @@ class GraphView { ...@@ -56,7 +56,8 @@ class GraphView {
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_; std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
}; };
typedef std::vector<std::pair<ir::Node*, ir::Node*>> SSANodePair; // swap pairs in sequence
typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;
class InplacePass : public ir::Pass { class InplacePass : public ir::Pass {
public: public:
InplacePass(); InplacePass();
...@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass { ...@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
void InitSSAGraphNodes() const; void InitSSAGraphNodes() const;
private: private:
const SSANodePair TryInplaceModifyVar(const std::string& var, const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
const std::string& cache_var, const std::string& cache_var,
const size_t& idx, const size_t& idx,
ir::Graph* graph) const; ir::Graph* graph) const;
void CommitModify(const SSANodePair&, ir::Graph* graph) const; void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;
void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const; void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;
void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
const size_t& idx) const; const size_t& idx) const;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
#include <queue>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
std::queue<VarHandleBase*> queue;
queue.push(var_in);
do {
auto* var = queue.front();
queue.pop();
for (auto* op : var->PendingOps()) {
auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) {
return compute_op;
}
for (auto* out_var : op->Outputs()) {
queue.push(out_var);
}
}
} while (!queue.empty());
return nullptr;
}
std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);
std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
unlived_vars.reserve(graph_pool.size());
for (auto& pair : graph_pool) {
unlived_vars.insert(std::make_pair(pair.first, pair.second));
}
auto compare_and_insert_early_delete_op = [&](
OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
if (unlived_vars.empty()) return;
// unlived vars can be deleted after the last used op has finished.
auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
for (auto& var : vars) {
auto* var_handle = dynamic_cast<VarHandle*>(var);
auto var_name = var->Node()->Name();
auto& var_place = var_handle->place();
if (unlived_vars.count(var_name) == 0) continue;
if (!unlived_vars[var_name].empty()) {
if (compute_op != nullptr &&
unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) {
unlived_vars[var_name].erase(compute_op->Node()->Op());
}
continue;
}
if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
var_handle->Node()->IsCtrlVar())
continue;
// shameless copyed from reference count pass.
if (compute_op == nullptr) {
// use next computation op scope
compute_op = FindNextComputationOpHandle(var_handle);
}
auto* early_delete_node =
graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get();
auto* early_delete_handle = new EarlyDeleteOpHandle(
early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc);
if (compute_op->Outputs().empty()) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
compute_op->AddOutput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
}
early_delete_handle->AddInput(compute_op->Outputs().front());
VLOG(5) << "Add early delete op " << var_name << " to Operator"
<< compute_op->Name();
}
};
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
for (auto& op : all_ops) {
compare_and_insert_early_delete_op(op, op->Inputs());
compare_and_insert_early_delete_op(op, op->Outputs());
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(memory_early_delete_pass,
paddle::framework::details::MemoryEarlyDeletePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kGarbageCollector);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/early_delete_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
class MemoryEarlyDeletePass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -13,17 +13,108 @@ ...@@ -13,17 +13,108 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <deque>
#include <functional> #include <functional>
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include "paddle/fluid/framework/var_desc.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
using paddle::framework::VarDesc;
size_t NodeSizeInBytes(const VarDesc& node) { std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
std::deque<ir::Node*> ops;
FilterVariables(nodes, [&](ir::Node* op) {
if (op->IsOp() && op->Op() != nullptr) {
ops.emplace_back(op);
}
});
std::unordered_map<ir::Node*, size_t> op_deps;
std::list<ir::Node*> ready_ops;
std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
for (auto* op : ops) {
std::unordered_set<ir::Node*> preceding_op;
for (auto* in : op->inputs) {
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
preceding_op.emplace(in->inputs[0]);
pending_ops[in->inputs[0]].emplace(op);
}
op_deps[op] = preceding_op.size();
if (preceding_op.empty()) {
ready_ops.emplace_back(op);
}
}
// 3. generated op list based desc order and the topology order
std::vector<ir::Node*> ret;
std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
auto update_by_found_node = [&](ir::Node* found_node) {
for (auto* pending_op : pending_ops[found_node]) {
if (--op_deps[pending_op] == 0) {
ready_ops.emplace_back(pending_op);
}
}
ready_ops.remove(found_node);
ret.emplace_back(found_node);
};
while (!ready_ops.empty()) {
bool all_of_ready_op_unmatched = true;
for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
auto op_desc = *it;
ir::Node* found_node = nullptr;
for (auto* op : ready_ops) {
if (IsSameDesc(op->Op(), op_desc)) {
found_node = op;
break;
}
}
// 3.1 op desc deleted by other pass
if (found_node == nullptr) {
++it;
continue;
} else {
all_of_ready_op_unmatched = false;
it = op_descs_list.erase(it);
}
update_by_found_node(found_node);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std::list<ir::Node*> prev_ready_ops(ready_ops);
if (all_of_ready_op_unmatched) {
for (auto op : prev_ready_ops) {
update_by_found_node(op);
}
}
}
PADDLE_ENFORCE(std::all_of(
op_deps.begin(), op_deps.end(),
[&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
return ret;
}
size_t NodeSize(const VarDesc& node) {
auto shape = node.GetShape(); auto shape = node.GetShape();
int size = int size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()); std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
...@@ -31,9 +122,9 @@ size_t NodeSizeInBytes(const VarDesc& node) { ...@@ -31,9 +122,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
return type_size * std::abs(size); return type_size * std::abs(size);
} }
size_t NodeSizeInBytes(ir::Node* n) { size_t NodeSize(ir::Node* n) {
auto* desc = FindVarDescInBlock(n); auto* desc = FindVarDescInBlock(n);
return NodeSizeInBytes(*desc); return NodeSize(*desc);
} }
std::string DebugStringImpl(VarDesc* var) { std::string DebugStringImpl(VarDesc* var) {
...@@ -59,7 +150,6 @@ std::string DebugStringImpl(VarDesc* var) { ...@@ -59,7 +150,6 @@ std::string DebugStringImpl(VarDesc* var) {
std::string DebugString(ir::Node* var) { std::string DebugString(ir::Node* var) {
return DebugStringImpl(FindVarDescInBlock(var)); return DebugStringImpl(FindVarDescInBlock(var));
} }
// return DebugString(var->Var()); }
// NOTE(dzh): based ir node, if a large node has been reused // NOTE(dzh): based ir node, if a large node has been reused
// by a small size node, then next time it appear in pool, it will // by a small size node, then next time it appear in pool, it will
...@@ -80,18 +170,17 @@ struct NodeComparator { ...@@ -80,18 +170,17 @@ struct NodeComparator {
auto rhs_shape = rhs_desc->GetShape(); auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) { (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); return NodeSize(lhs) <= NodeSize(rhs);
} else { } else {
return false; return false;
} }
} }
}; };
void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { void OrderedSet::Insert(ir::Node* var) {
PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
PADDLE_ENFORCE(op->IsOp());
if (mark_table_.count(var->Name()) != 0) { if (mark_table_.count(var->Name()) != 0) {
mark_table_[var->Name()]->second.insert(op); mark_table_[var->Name()]->emplace_back(var);
return; return;
} }
...@@ -99,14 +188,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { ...@@ -99,14 +188,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
auto var_shape = var_desc->GetShape(); auto var_shape = var_desc->GetShape();
int batch_size = static_cast<int>(var_shape[0]); int batch_size = static_cast<int>(var_shape[0]);
NodeComparator compare_node; NodeComparator functor;
Iter it = nodes_.begin(); Iter it = nodes_.begin();
while (it != nodes_.end()) { while (it != nodes_.end()) {
auto* cache_desc = FindVarDescInBlock(it->first); auto& prev = it->front();
auto* cache_desc = FindVarDescInBlock(prev);
int cache_batch_size = cache_desc->GetShape()[0]; int cache_batch_size = cache_desc->GetShape()[0];
if ((cache_batch_size == -1 && batch_size == -1) || if ((cache_batch_size == -1 && batch_size == -1) ||
(cache_batch_size != -1 && batch_size != -1)) { (cache_batch_size != -1 && batch_size != -1)) {
if (compare_node(it->first, var)) { if (functor(prev, var)) {
++it; ++it;
} else { } else {
break; break;
...@@ -118,62 +208,80 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { ...@@ -118,62 +208,80 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
} }
} }
it = it = nodes_.insert(it, {var});
nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
mark_table_[var->Name()] = it; mark_table_[var->Name()] = it;
} }
int OrderedNodeList::GetIndex(ir::Node* var) { int OrderedSet::GetNodeIndexInPool(ir::Node* var) {
return std::distance(nodes_.begin(), mark_table_[var->Name()]); return std::distance(nodes_.begin(), mark_table_[var->Name()]);
} }
ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const { ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
ir::Node* found_node = nullptr; ir::Node* found_node = nullptr;
NodeComparator compare_node; NodeComparator functor;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
if (compare_node(var, it->first)) { auto& candidate = it->front();
found_node = it->first; if (functor(var, candidate)) {
found_node = candidate;
break; break;
} }
} }
return found_node; return found_node;
} }
void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); } bool OrderedSet::Has(ir::Node* var) const {
if (mark_table_.count(var->Name())) {
auto& node_in_samename = mark_table_.at(var->Name());
auto iter =
std::find_if(node_in_samename->begin(), node_in_samename->end(),
[&](ir::Node* n) { return n->Name() == var->Name(); });
return iter != node_in_samename->end();
}
return false;
}
void OrderedNodeList::Erase(const std::string& var) { void OrderedSet::Erase(ir::Node* var) {
PADDLE_ENFORCE(mark_table_.count(var)); PADDLE_ENFORCE(mark_table_.count(var->Name()));
nodes_.erase(mark_table_[var]); nodes_.erase(mark_table_[var->Name()]);
mark_table_.erase(var); mark_table_.erase(var->Name());
} }
std::string OrderedNodeList::ToString() const { std::string OrderedSet::ToString() const {
std::stringstream ss; std::stringstream ss;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
ss << DebugString(it->first) << " "; for (auto& node : *it) {
ss << DebugString(node) << " ";
}
} }
return ss.str(); return ss.str();
} }
bool NodeCanReused(ir::Node* node) { bool NodeCanReused(ir::Node* node) {
// valid the node is a var node
if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
// auto* desc = node->Var();
bool flag = NodeCanReused(*node->Var()); bool flag = true;
// op output force generated in cpu, can not be reused.
for (auto* op : node->inputs) { for (auto* op : node->inputs) {
if (op->Op()->HasAttr("force_cpu")) { if (op->Op()->HasAttr("force_cpu")) {
// op output force generated in cpu, can not be reused.
flag &= framework::AttrReader(op->Op()->GetAttrMap()) flag &= framework::AttrReader(op->Op()->GetAttrMap())
.Get<bool>("force_cpu") == 0; .Get<bool>("force_cpu") == 0;
} }
} }
// var desc validation.
flag &= NodeCanReused(*node->Var());
return flag; return flag;
} }
bool NodeCanReused(const VarDesc& node) { bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType(); auto type = node.GetType();
if (node.Persistable() || type != proto::VarType::LOD_TENSOR || if (!(type == proto::VarType::LOD_TENSOR ||
node.GetShape().empty()) { type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false;
}
if (node.Persistable() || node.GetShape().empty()) {
return false; return false;
} }
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
...@@ -193,6 +301,174 @@ bool OpHasSubBlock(OpDesc* desc) { ...@@ -193,6 +301,174 @@ bool OpHasSubBlock(OpDesc* desc) {
return false; return false;
} }
ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
ops_ = SortOpLikeDescOrder(graph);
ConnectNodes();
}
void ControlFlowGraph::BuildCFGGraph() {
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for (ir::Node* op : ops_) {
for (auto& input_var : op->inputs) {
if (!input_var->inputs.empty()) {
PADDLE_ENFORCE(
input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
auto* pred_op = input_var->inputs[0];
if (pred_op->Op() != nullptr) {
predecessors_[op].insert(pred_op);
successors_[pred_op].insert(op);
}
}
if (input_var->IsVar() && !input_var->IsCtrlVar()) {
uses_[op].insert(input_var->Name());
}
}
for (auto& output_var : op->outputs) {
// output var may be used by many op
for (auto* succ_op : output_var->outputs) {
if (succ_op->Op() != nullptr) {
successors_[op].insert(succ_op);
predecessors_[succ_op].insert(op);
}
}
if (output_var->IsVar() && !output_var->IsCtrlVar()) {
defs_[op].insert(output_var->Name());
}
}
}
}
void ControlFlowGraph::ConnectNodes() {
for (size_t i = 0; i < ops_.size(); ++i) {
auto& op = ops_[i];
try {
auto& next_op = ops_.at(i + 1);
successors_[op].insert(next_op);
predecessors_[next_op].insert(op);
} catch (...) {
// do nothing
}
FilterVariables(op->inputs,
[&](ir::Node* var) { uses_[op].emplace(var->Name()); });
FilterVariables(op->outputs,
[&](ir::Node* var) { defs_[op].emplace(var->Name()); });
}
}
void ControlFlowGraph::LiveVariableAnalysis() {
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
while (!work_list.empty()) {
ir::Node* op = work_list.front();
work_list.pop_front();
// get the live_in calculated before. Empty if first.
auto prev_live_in = std::move(live_in_[op]);
for (auto& s : successors_[op]) {
for (auto& var : live_in_[s]) {
live_out_[op].insert(var);
}
}
for (auto& var : uses_[op]) {
live_in_[op].insert(var);
}
for (auto& var : live_out_[op]) {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
live_in_[op].erase(var);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if (live_in_[op] != prev_live_in) {
for (auto& pre : predecessors_[op]) {
work_list.push_back(pre);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
if (uses_[op].find(old_node) != uses_[op].end()) {
uses_[op].erase(old_node);
uses_[op].insert(new_node);
}
if (defs_[op].find(old_node) != defs_[op].end()) {
defs_[op].erase(old_node);
defs_[op].insert(new_node);
}
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
ir::Node* op) const {
// in ssa-graph, different version nodes have same name,
// this function get the latest version var before target op
// It may return nullptr, such as data node.
ir::Node* found_node = nullptr;
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
if (output->Name() == name) {
found_node = output;
}
}
}
return found_node;
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#include <iostream> #include <iostream>
#include <iterator> #include <iterator>
#include <list> #include <list>
#include <map>
#include <set>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -27,41 +29,41 @@ namespace paddle { ...@@ -27,41 +29,41 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kFetchedVars[] = "fetched_vars"; constexpr char kAllOpDescs[] = "all_op_descs";
constexpr char kGraphNodePool[] = "graph_node_pool";
// NOTE(dzh): Variable and the operators use the var. std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// for early delete pass.
// Because analysis var pass build base on ir::Node, which maybe released
// or modified between passes, so we use OpDesc* to mark ops.
using GraphNodePool = std::vector<
std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). // NOTE(dzh): A ordered set for node reuse in memory optimize.
// in fluid, -1 means the batch_size is determined in runtime. // the orderedset sort node in ascend order(by node bytes size).
// the node batch_size equal -1 always ranking in the front than the node not. // in fluid, -1 means the batch_size, which is determined in runtime.
// So the reuse happens between nodes who's batch_size both are -1
// simultaneously or not.
//
// sort rule:
// rule 0 : smaller node ranking in front.
// rule 1 : batch_size equal -1 ranking in the front than the node not.
//
// For example, // For example,
// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
// O(1) insert, delete
class OrderedNodeList {
public:
using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
using Iter = typename std::list<NodePair>::iterator;
using ConstIter = typename std::list<NodePair>::const_iterator;
void Insert(ir::Node* var, ir::Node* op); class OrderedSet {
public:
// nodes with same name exists in pool.
using NodeVector = std::vector<ir::Node*>;
using Iter = typename std::list<NodeVector>::iterator;
using ConstIter = typename std::list<NodeVector>::const_iterator;
void Insert(ir::Node* var);
void Erase(ir::Node* var); void Erase(ir::Node* var);
bool Has(ir::Node* var) const;
void Erase(const std::string& var); void Clear() {
mark_table_.clear();
bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } nodes_.clear();
}
bool Has(const std::string& var) { return mark_table_.count(var); } // find the bestfit shape node block with var.
ir::Node* FindBestFitNode(ir::Node* var) const;
ir::Node* NodeMatch(ir::Node* var) const;
// map store non-const iterator, can not promise const // map store non-const iterator, can not promise const
int GetIndex(ir::Node* var); int GetNodeIndexInPool(ir::Node* var);
// pool all node to string // pool all node to string
std::string ToString() const; std::string ToString() const;
...@@ -69,18 +71,54 @@ class OrderedNodeList { ...@@ -69,18 +71,54 @@ class OrderedNodeList {
Iter end() { return nodes_.end(); } Iter end() { return nodes_.end(); }
ConstIter begin() const { return nodes_.begin(); } ConstIter begin() const { return nodes_.begin(); }
ConstIter end() const { return nodes_.end(); } ConstIter end() const { return nodes_.end(); }
size_t size() const { return nodes_.size(); }
void Clear() { size_t size() const { return nodes_.size(); }
mark_table_.clear();
nodes_.clear();
}
private: private:
// for searching. // for searching.
std::unordered_map<std::string, Iter> mark_table_; std::unordered_map<std::string, Iter> mark_table_;
// node swap pairs. var -> ops dep var // node pool
std::list<NodePair> nodes_; std::list<NodeVector> nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// IR Graph
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successors ops use the output variables.
NodeListMap successors_;
// predecessors ops generated input variables.
NodeListMap predecessors_;
// variables lived before run current op.
VarSetMap live_in_;
// variables lived after run current op.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
}; };
// valid a tensor can be reuse or not // valid a tensor can be reuse or not
...@@ -93,15 +131,24 @@ bool NodeCanReused(const VarDesc& node); ...@@ -93,15 +131,24 @@ bool NodeCanReused(const VarDesc& node);
bool OpHasSubBlock(OpDesc* desc); bool OpHasSubBlock(OpDesc* desc);
// node memory size in bytes // node memory size in bytes
size_t NodeSizeInBytes(ir::Node* n); size_t NodeSize(ir::Node* n);
// node memory size in bytes // node memory size in bytes
size_t NodeSizeInBytes(const VarDesc&); size_t NodeSize(const VarDesc&);
std::string DebugString(ir::Node* var); std::string DebugString(ir::Node* var);
// NOTE(dzhwinter)
// after node reuse, the replaced node shape is
// different with its VarDesc. So need to find the
// correct VarDesc in Block.
VarDesc* FindVarDescInBlock(ir::Node* n); VarDesc* FindVarDescInBlock(ir::Node* n);
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
template <typename Container, typename Callback> template <typename Container, typename Callback>
class FilterVariableImpl { class FilterVariableImpl {
public: public:
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <iterator>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include <string> #include <string>
...@@ -22,13 +23,19 @@ ...@@ -22,13 +23,19 @@
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
TEST(OrderedNodeList, Normal) { TEST(OrderedSet, Normal) {
OrderedNodeList pool; OrderedSet pool;
std::vector<std::unique_ptr<ir::Node>> nodes; std::vector<std::unique_ptr<ir::Node>> nodes;
// clang-format off // clang-format off
...@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) { ...@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
nodes.emplace_back(std::move(node)); nodes.emplace_back(std::move(node));
} }
// Insert
for (auto& node : nodes) { for (auto& node : nodes) {
pool.Insert(node.get(), op.get()); pool.Insert(node.get());
}
// Has/size
ASSERT_EQ(pool.size(), shapes.size());
for (auto& node : nodes) {
ASSERT_TRUE(pool.Has(node.get()));
} }
// assert its order and interface. // assert its order and interface.
...@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) { ...@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
std::cout << pool.ToString() << std::endl; std::cout << pool.ToString() << std::endl;
ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1)); ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0);
{ {
auto v1 = block_desc->Var("11"); auto v1 = block_desc->Var("11");
v1->SetShape({-1, 256, 56, 56}); v1->SetShape({-1, 256, 56, 56});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1); std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
node1->inputs.emplace_back(op.get()); node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get()); auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(cache, nullptr); ASSERT_EQ(cache, nullptr);
} }
{ {
...@@ -81,16 +95,401 @@ TEST(OrderedNodeList, Normal) { ...@@ -81,16 +95,401 @@ TEST(OrderedNodeList, Normal) {
v2->SetShape({-1, 2, 5}); v2->SetShape({-1, 2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2); std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
node1->inputs.emplace_back(op.get()); node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get()); auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5]
} }
{ {
auto v3 = block_desc->Var("13"); auto v3 = block_desc->Var("13");
v3->SetShape({2, 5}); v3->SetShape({2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3); std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
node1->inputs.emplace_back(op.get()); node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get()); auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2]
}
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
paddle::framework::AssignOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace paddle {
namespace framework {
namespace details {
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"c"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"d"});
op->SetOutput("Out", {"e"});
}
return prog;
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
// test assign op
ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
// test sum op
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
}
// 1. normal test
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->outputs.back()->Name() == "e") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
graph.RemoveNode(found_node);
graph.RemoveNode(e);
// other node keeps the same order
auto remain_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < remain_nodes.size(); ++i) {
auto node = remain_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
op_descs.insert(op_descs.begin() + 4, op);
auto nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
if (node->Name() == "sum") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* c = find_node_in_graph("c");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(c->outputs.begin(), c->outputs.end(), found_node);
ir::Node* pending_op = found_node->outputs[0]->outputs[0];
graph.RemoveNode(e);
graph.RemoveNode(pending_op);
graph.RemoveNode(found_node);
}
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.insert(op_descs.begin() + 2, op);
// check the order
auto mynodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < mynodes.size(); ++i) {
auto node = mynodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.emplace_back(op);
// replace op_desc inplace
auto nodes = graph.Nodes();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->Op() && node->Name() == "assign") {
if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
found_node = node;
break;
}
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(e->inputs.begin(), e->inputs.end(), found_node);
graph.RemoveNode(found_node);
}
op_descs.erase(op_descs.begin() + 3);
auto replace_op = prog.MutableBlock(0)->AppendOp();
replace_op->SetType("sum");
replace_op->SetInput("X", {"d", "d1"});
replace_op->SetOutput("Out", {"e"});
{
ir::Node* sum2 = graph.CreateOpNode(replace_op);
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
ir::Node* d1 = find_node_in_graph("d1");
sum2->inputs.emplace_back(d);
sum2->inputs.emplace_back(d1);
sum2->outputs.emplace_back(e);
e->inputs.emplace_back(sum2);
d->outputs.emplace_back(sum2);
d1->outputs.emplace_back(sum2);
}
op_descs.emplace_back(replace_op);
// compare op order
auto graph_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < graph_nodes.size(); ++i) {
auto node = graph_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
} }
} }
......
...@@ -43,11 +43,6 @@ namespace paddle { ...@@ -43,11 +43,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes(); auto nodes = graph->Nodes();
...@@ -77,7 +72,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -77,7 +72,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
skip_set_.count(var->Name())) skip_set_.count(var->Name()))
continue; continue;
ir::Node* cache = pool_.NodeMatch(var); ir::Node* cache = pool_.FindBestFitNode(var);
if (var->Name() == FLAGS_memory_optimize_debug) { if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op " VLOG(3) << "start match var " << DebugString(var) << " of op "
...@@ -95,11 +90,12 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -95,11 +90,12 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
<< "replace it again. Skip this candidate."; << "replace it again. Skip this candidate.";
continue; continue;
int node_idx_in_pool = pool_.GetIndex(cache); int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf( VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d", "!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache), std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size())); node_idx_in_pool, static_cast<int>(pool_.size()));
// update CFG Graph on the fly. // update CFG Graph on the fly.
// reused var maybe re-fill into the pool // reused var maybe re-fill into the pool
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
...@@ -112,6 +108,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -112,6 +108,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
pool_.Erase(cache); pool_.Erase(cache);
} }
// fill the pool // fill the pool
std::unordered_set<std::string> unlived_vars; std::unordered_set<std::string> unlived_vars;
for (auto var : cfg_->LiveIn(op)) { for (auto var : cfg_->LiveIn(op)) {
...@@ -120,36 +117,15 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -120,36 +117,15 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
} }
} }
for (auto var : unlived_vars) { for (auto var : unlived_vars) {
ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (NodeCanReused(var_node) && !pool_.Has(var_node)) { if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node, op); pool_.Insert(var_node);
} }
} }
} }
} }
graph->ResolveHazard(var_nodes_); graph->ResolveHazard(var_nodes_);
// For early delete pass. use GraphNodePool load the unlived vars.
// 1. find all deps op for each unlived var in memory pool.
for (auto& op : graph->Nodes()) {
for (auto& var : op->inputs) {
if (pool_.Has(var)) {
pool_.Insert(var, op);
}
}
}
// 2. convert ir node based memory pool to graph node
// because Node* maybe released bettwen passes.
auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
for (auto it = pool_.begin(); it != pool_.end(); ++it) {
std::unordered_set<OpDesc*> descs;
for (auto& op : it->second) {
PADDLE_ENFORCE(op->IsOp());
descs.insert(op->Op());
}
graph_pool.push_back(std::make_pair(it->first->Name(), descs));
}
return graph; return graph;
} }
...@@ -198,12 +174,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { ...@@ -198,12 +174,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
PADDLE_ENFORCE(sub_op != nullptr); PADDLE_ENFORCE(sub_op != nullptr);
for (auto* var : sub_op->outputs) { for (auto* var : sub_op->outputs) {
if (NodeCanReused(var)) { if (NodeCanReused(var)) {
ir::Node* cache = pool_.NodeMatch(var); ir::Node* cache = pool_.FindBestFitNode(var);
if (cache != nullptr) { if (cache != nullptr) {
if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
continue; continue;
} }
int node_idx_in_pool = pool_.GetIndex(cache); int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf( VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d", "!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(sub_reuse_id++), DebugString(var), std::to_string(sub_reuse_id++), DebugString(var),
...@@ -342,267 +318,10 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -342,267 +318,10 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
var_nodes_.at(var).clear(); var_nodes_.at(var).clear();
} }
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
std::deque<ir::Node*> ops;
FilterVariables(nodes, [&](ir::Node* op) {
if (op->IsOp() && op->Op() != nullptr) {
ops.emplace_back(op);
}
});
std::unordered_map<ir::Node*, size_t> op_deps;
std::list<ir::Node*> ready_ops;
std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
for (auto* op : ops) {
std::unordered_set<ir::Node*> preceding_op;
for (auto* in : op->inputs) {
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
preceding_op.emplace(in->inputs[0]);
pending_ops[in->inputs[0]].emplace(op);
}
op_deps[op] = preceding_op.size();
if (preceding_op.empty()) {
ready_ops.emplace_back(op);
}
}
// 3. generated op list based desc order and the topology order
std::vector<ir::Node*> ret;
std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
auto update_by_found_node = [&](ir::Node* found_node) {
for (auto* pending_op : pending_ops[found_node]) {
if (--op_deps[pending_op] == 0) {
ready_ops.emplace_back(pending_op);
}
}
ready_ops.remove(found_node);
ret.emplace_back(found_node);
};
while (!ready_ops.empty()) {
bool all_of_ready_op_unmatched = true;
for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
auto op_desc = *it;
ir::Node* found_node = nullptr;
for (auto* op : ready_ops) {
if (IsSameDesc(op->Op(), op_desc)) {
found_node = op;
break;
}
}
// 3.1 op desc deleted by other pass
if (found_node == nullptr) {
++it;
continue;
} else {
all_of_ready_op_unmatched = false;
it = op_descs_list.erase(it);
}
update_by_found_node(found_node);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std::list<ir::Node*> prev_ready_ops(ready_ops);
if (all_of_ready_op_unmatched) {
for (auto op : prev_ready_ops) {
update_by_found_node(op);
}
}
}
PADDLE_ENFORCE(std::all_of(
op_deps.begin(), op_deps.end(),
[&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
return ret;
}
ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
ops_ = SortOpLikeDescOrder(graph);
ConnectNodes();
}
void ControlFlowGraph::BuildCFGGraph() {
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for (ir::Node* op : ops_) {
for (auto& input_var : op->inputs) {
if (!input_var->inputs.empty()) {
PADDLE_ENFORCE(
input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
auto* pred_op = input_var->inputs[0];
if (pred_op->Op() != nullptr) {
predecessors_[op].insert(pred_op);
successors_[pred_op].insert(op);
}
}
if (input_var->IsVar() && !input_var->IsCtrlVar()) {
uses_[op].insert(input_var->Name());
}
}
for (auto& output_var : op->outputs) {
// output var may be used by many op
for (auto* succ_op : output_var->outputs) {
if (succ_op->Op() != nullptr) {
successors_[op].insert(succ_op);
predecessors_[succ_op].insert(op);
}
}
if (output_var->IsVar() && !output_var->IsCtrlVar()) {
defs_[op].insert(output_var->Name());
}
}
}
}
void ControlFlowGraph::ConnectNodes() {
for (size_t i = 0; i < ops_.size(); ++i) {
auto& op = ops_[i];
try {
auto& next_op = ops_.at(i + 1);
successors_[op].insert(next_op);
predecessors_[next_op].insert(op);
} catch (...) {
// do nothing
}
FilterVariables(op->inputs,
[&](ir::Node* var) { uses_[op].emplace(var->Name()); });
FilterVariables(op->outputs,
[&](ir::Node* var) { defs_[op].emplace(var->Name()); });
}
}
void ControlFlowGraph::LiveVariableAnalysis() {
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
while (!work_list.empty()) {
ir::Node* op = work_list.front();
work_list.pop_front();
// get the live_in calculated before. Empty if first.
auto prev_live_in = std::move(live_in_[op]);
for (auto& s : successors_[op]) {
for (auto& var : live_in_[s]) {
live_out_[op].insert(var);
}
}
for (auto& var : uses_[op]) {
live_in_[op].insert(var);
}
for (auto& var : live_out_[op]) {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
live_in_[op].erase(var);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if (live_in_[op] != prev_live_in) {
for (auto& pre : predecessors_[op]) {
work_list.push_back(pre);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
if (uses_[op].find(old_node) != uses_[op].end()) {
uses_[op].erase(old_node);
uses_[op].insert(new_node);
}
if (defs_[op].find(old_node) != defs_[op].end()) {
defs_[op].erase(old_node);
defs_[op].insert(new_node);
}
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
ir::Node* op) const {
// in ssa-graph, different version nodes have same name,
// this function get the latest version var before target op
// It may return nullptr, such as data node.
ir::Node* found_node = nullptr;
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
if (output->Name() == name) {
found_node = output;
}
}
}
return found_node;
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
REGISTER_PASS(memory_optimize_pass, REGISTER_PASS(memory_optimize_pass,
paddle::framework::details::MemoryOptimizePass) paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs); .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
...@@ -32,20 +32,15 @@ ...@@ -32,20 +32,15 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
class ControlFlowGraph;
class MemoryOptimizePass : public ir::Pass { class MemoryOptimizePass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override; std::unique_ptr<ir::Graph> graph) const override;
private:
// fill the variable map(var_nodes) by version. // fill the variable map(var_nodes) by version.
void InitSSAGraphNodes() const; void InitSSAGraphNodes() const;
private:
// update program descs // update program descs
void RenameVarInGraphDesc(const std::string& var, void RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var, size_t idx) const; const std::string& cache_var, size_t idx) const;
...@@ -62,7 +57,7 @@ class MemoryOptimizePass : public ir::Pass { ...@@ -62,7 +57,7 @@ class MemoryOptimizePass : public ir::Pass {
private: private:
// Reuse Node Pool, Owned. // Reuse Node Pool, Owned.
mutable OrderedNodeList pool_; mutable OrderedSet pool_;
// controlflow Graph // controlflow Graph
mutable std::unique_ptr<ControlFlowGraph> cfg_; mutable std::unique_ptr<ControlFlowGraph> cfg_;
// skip set // skip set
...@@ -71,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass { ...@@ -71,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_; mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
}; };
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// For IR Graph in parallelexecutor
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successors ops use the output variables.
NodeListMap successors_;
// predecessors ops generated input variables.
NodeListMap predecessors_;
// variables lived before run current op.
VarSetMap live_in_;
// variables lived after run current op.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
paddle::framework::AssignOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace paddle {
namespace framework {
namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"c"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"d"});
op->SetOutput("Out", {"e"});
}
return prog;
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
// test assign op
ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
// test sum op
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
}
// 1. normal test
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->outputs.back()->Name() == "e") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
graph.RemoveNode(found_node);
graph.RemoveNode(e);
// other node keeps the same order
auto remain_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < remain_nodes.size(); ++i) {
auto node = remain_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
op_descs.insert(op_descs.begin() + 4, op);
auto nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
if (node->Name() == "sum") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* c = find_node_in_graph("c");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(c->outputs.begin(), c->outputs.end(), found_node);
ir::Node* pending_op = found_node->outputs[0]->outputs[0];
graph.RemoveNode(e);
graph.RemoveNode(pending_op);
graph.RemoveNode(found_node);
}
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.insert(op_descs.begin() + 2, op);
// check the order
auto mynodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < mynodes.size(); ++i) {
auto node = mynodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.emplace_back(op);
// replace op_desc inplace
auto nodes = graph.Nodes();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->Op() && node->Name() == "assign") {
if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
found_node = node;
break;
}
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(e->inputs.begin(), e->inputs.end(), found_node);
graph.RemoveNode(found_node);
}
op_descs.erase(op_descs.begin() + 3);
auto replace_op = prog.MutableBlock(0)->AppendOp();
replace_op->SetType("sum");
replace_op->SetInput("X", {"d", "d1"});
replace_op->SetOutput("Out", {"e"});
{
ir::Node* sum2 = graph.CreateOpNode(replace_op);
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
ir::Node* d1 = find_node_in_graph("d1");
sum2->inputs.emplace_back(d);
sum2->inputs.emplace_back(d1);
sum2->outputs.emplace_back(e);
e->inputs.emplace_back(sum2);
d->outputs.emplace_back(sum2);
d1->outputs.emplace_back(sum2);
}
op_descs.emplace_back(replace_op);
// compare op order
auto graph_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < graph_nodes.size(); ++i) {
auto node = graph_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -65,7 +65,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( ...@@ -65,7 +65,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run(
if (pool_) { if (pool_) {
run_futures.emplace_back(pool_->enqueue(std::move(call))); run_futures.emplace_back(pool_->enqueue(std::move(call)));
} else { } else {
fetch_data.emplace_back(std::move(call())); fetch_data.emplace_back(call());
} }
} }
...@@ -74,7 +74,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( ...@@ -74,7 +74,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run(
if (exception_holder_.IsCaught()) { if (exception_holder_.IsCaught()) {
f.wait(); f.wait();
} else { } else {
fetch_data.emplace_back(std::move(f.get())); fetch_data.emplace_back(f.get());
} }
} }
} }
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle { namespace paddle {
......
...@@ -21,8 +21,6 @@ namespace paddle { ...@@ -21,8 +21,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
class SequentialExecutionPass : public ir::Pass { class SequentialExecutionPass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
......
...@@ -44,6 +44,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, ...@@ -44,6 +44,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
// Since we want to fetch LodTensor from a variable, the variable must // Since we want to fetch LodTensor from a variable, the variable must
// be created alreadly. // be created alreadly.
Variable* g_fetch_value = scope.FindVar(var_name); Variable* g_fetch_value = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(), PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
"Only %s can be invoked by GetFetchVariable", "Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name()); typeid(FeedFetchList).name());
......
...@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference { ...@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const { bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
return in.Name() != out.Name() && details::NodeCanReused(in) && return in.Name() != out.Name() && details::NodeCanReused(in) &&
details::NodeCanReused(out) && details::NodeCanReused(out) &&
details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in); details::NodeSize(out) <= details::NodeSize(in);
} }
}; };
......
...@@ -76,7 +76,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram( ...@@ -76,7 +76,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
var->inputs.push_back(node); var->inputs.push_back(node);
} }
} }
return std::move(var_nodes); return var_nodes;
} }
void Graph::ResolveHazard( void Graph::ResolveHazard(
......
...@@ -142,7 +142,7 @@ class Graph { ...@@ -142,7 +142,7 @@ class Graph {
// TODO(panyx0718): control var name should be really unique. // TODO(panyx0718): control var name should be really unique.
const std::string name = string::Sprintf( const std::string name = string::Sprintf(
"%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName), "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
node_set_.size()); num_node_created_);
auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
x->SetId(num_node_created_++); x->SetId(num_node_created_++);
return x; return x;
......
...@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase { ...@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
std::unordered_set<const Node*> invalid_nodes; std::unordered_set<const Node*> invalid_nodes;
int valid_op = 0; int valid_op = 0;
for (auto* node : graph->Nodes()) { for (auto* node : graph->Nodes()) {
PADDLE_ENFORCE_NOT_NULL(node);
if (is_valid_node(node)) { if (is_valid_node(node)) {
invalid_nodes.insert(node); invalid_nodes.insert(node);
} else if (node->IsOp()) { } else if (node->IsOp()) {
......
...@@ -164,7 +164,7 @@ ProgramDesc BuildProgramDesc(int num_inputs_of_concat) { ...@@ -164,7 +164,7 @@ ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
}; };
std::vector<std::string> concat_inputs; std::vector<std::string> concat_inputs;
for (int i = 0; i < num_inputs_of_concat; ++i) { for (int i = 0; i < num_inputs_of_concat; ++i) {
std::string prefix = "seqpool_op_" + i; std::string prefix = "seqpool_op_" + std::to_string(i);
new_var(prefix + "in"); new_var(prefix + "in");
new_var(prefix + "out"); new_var(prefix + "out");
new_var(prefix + "out_unused"); new_var(prefix + "out_unused");
......
...@@ -188,14 +188,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -188,14 +188,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(3) << place << " " << DebugStringEx(&scope); VLOG(3) << place << " " << DebugStringEx(&scope);
} catch (platform::EnforceNotMet exception) { } catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) { if (Attrs().count("sub_block") != 0) {
throw exception; throw;
} }
auto& callstack = Attr<std::vector<std::string>>( auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) { if (callstack.empty()) {
throw exception; throw;
} }
std::ostringstream sout; std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n"; sout << "Invoke operator " << Type() << " error.\n";
...@@ -206,7 +206,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -206,7 +206,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
sout << "C++ Callstacks: \n"; sout << "C++ Callstacks: \n";
sout << exception.err_str_; sout << exception.err_str_;
exception.err_str_ = sout.str(); exception.err_str_ = sout.str();
throw exception; throw;
} catch (...) { } catch (...) {
std::rethrow_exception(std::current_exception()); std::rethrow_exception(std::current_exception());
} }
...@@ -589,7 +589,7 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -589,7 +589,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
public: public:
RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
const RuntimeContext& ctx) const RuntimeContext& ctx)
: op_(op), scope_(scope), ctx_(ctx) {} : op_(op), ctx_(ctx) {}
bool HasInput(const std::string& name) const override { bool HasInput(const std::string& name) const override {
// has only one input // has only one input
...@@ -881,7 +881,6 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -881,7 +881,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
const OperatorBase& op_; const OperatorBase& op_;
const Scope& scope_;
const RuntimeContext& ctx_; const RuntimeContext& ctx_;
}; };
...@@ -990,11 +989,14 @@ void OperatorWithKernel::TransferInplaceVarsBack( ...@@ -990,11 +989,14 @@ void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& transfer_scope) const { const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) { for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* origin_var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.",
var_name);
auto* original_tensor = auto* original_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name)); GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
auto* var = transfer_scope.FindVar(var_name); auto* var = transfer_scope.FindVar(var_name);
PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr", PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.",
var_name); var_name);
auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
original_tensor->ShareDataWith(*transformed_tensor); original_tensor->ShareDataWith(*transformed_tensor);
} }
......
...@@ -222,12 +222,7 @@ class ExecutionContext { ...@@ -222,12 +222,7 @@ class ExecutionContext {
if (it == ctx_.inputs.end()) { if (it == ctx_.inputs.end()) {
return {}; return {};
} }
std::vector<const Variable*> res; return {it->second.begin(), it->second.end()};
res.reserve(it->second.size());
std::transform(it->second.begin(), it->second.end(),
std::back_inserter(res),
[this](Variable* var) { return var; });
return res;
} }
std::vector<Variable*> MultiOutputVar(const std::string& name) const { std::vector<Variable*> MultiOutputVar(const std::string& name) const {
......
...@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts( ...@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
graph = eager_deletion_pass->Apply(std::move(graph)); graph = eager_deletion_pass->Apply(std::move(graph));
VLOG(10) << "EagerDeletionPass Applied"; VLOG(10) << "EagerDeletionPass Applied";
if (build_strategy_.memory_early_delete_) {
auto early_delete_pass =
ir::PassRegistry::Instance().Get("memory_early_delete_pass");
early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
graph = early_delete_pass->Apply(std::move(graph));
}
VLOG(10) << "MemoryEarlyDeletePass Applied.";
} }
return graph; return graph;
...@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor( ...@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
graphs.push_back(std::move(graph)); graphs.push_back(std::move(graph));
#endif #endif
auto max_memory_size = GetEagerDeletionThreshold(); auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) { if (max_memory_size >= 0) {
for (size_t i = 0; i < graphs.size(); ++i) { for (size_t i = 0; i < graphs.size(); ++i) {
graphs[i] = member_->PrepareGCAndRefCnts( graphs[i] = member_->PrepareGCAndRefCnts(
...@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
USE_PASS(memory_early_delete_pass);
USE_PASS(reference_count_pass); USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass); USE_PASS(eager_deletion_pass);
...@@ -22,11 +22,7 @@ limitations under the License. */ ...@@ -22,11 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
DEFINE_bool(benchmark, false, DECLARE_bool(benchmark);
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
DEFINE_bool( DEFINE_bool(
eager_delete_scope, true, eager_delete_scope, true,
......
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc) cc_library(engine SRCS engine.cc)
endif() endif()
...@@ -58,12 +58,13 @@ if(WIN32) ...@@ -58,12 +58,13 @@ if(WIN32)
sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
analysis_config paddle_pass_builder) analysis_config paddle_pass_builder)
target_link_libraries(paddle_fluid_shared shlwapi)
else(WIN32) else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
analysis_config paddle_pass_builder) analysis_config paddle_pass_builder)
endif() endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE AND NOT WIN32) if(NOT APPLE AND NOT WIN32)
......
...@@ -101,7 +101,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) { ...@@ -101,7 +101,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
} }
graph = pass->Apply(std::move(graph)); graph = pass->Apply(std::move(graph));
} }
return std::move(graph); return graph;
} }
framework::proto::ProgramDesc IRPassManager::AcquireProgram( framework::proto::ProgramDesc IRPassManager::AcquireProgram(
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
if(WITH_TESTING)
add_dependencies(subgraph_detector gtest)
endif()
if (WITH_GPU AND TENSORRT_FOUND) if (WITH_GPU AND TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
......
...@@ -52,8 +52,8 @@ cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_ ...@@ -52,8 +52,8 @@ cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy) cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy) cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context)
function(anakin_target target_name) function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction() endfunction()
......
...@@ -421,7 +421,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -421,7 +421,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) { if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
return nullptr; return nullptr;
} }
return std::move(predictor); return predictor;
} }
void AnalysisPredictor::PrepareFeedFetch() { void AnalysisPredictor::PrepareFeedFetch() {
......
...@@ -16,6 +16,12 @@ ...@@ -16,6 +16,12 @@
/*! \file paddle_api.h /*! \file paddle_api.h
*/ */
/*! \mainpage Paddle Inference APIs
* \section intro_sec Introduction
* The Paddle inference library aims to offer an high performance inference SDK
* for Paddle users.
*/
#include <cassert> #include <cassert>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -34,26 +40,49 @@ enum PaddleDType { ...@@ -34,26 +40,49 @@ enum PaddleDType {
}; };
/** /**
*\brief Memory menager for PaddleTensor. * \brief Memory manager for `PaddleTensor`.
* *
*The PaddleBuf holds a buffer for data input or output. The memory can be * The PaddleBuf holds a buffer for data input or output. The memory can be
*allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
*should be reused for better performance. * should be reused for better performance.
* *
*For user allocated memory, the following API can be used: * For user allocated memory, the following API can be used:
*- PaddleBuf(void* data, size_t length) to set an external memory by * - PaddleBuf(void* data, size_t length) to set an external memory by
*specifying * specifying the memory address and length.
* the memory address and length. * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
*- Reset(void* data, size_t length) to reset the PaddleBuf with an external
*memory. *memory.
*ATTENTION, for user allocated memory, deallocation should be done by users * ATTENTION, for user allocated memory, deallocation should be done by users
*externally after the program finished. The PaddleBuf won't do any allocation *externally after the program finished. The PaddleBuf won't do any allocation
*or deallocation. *or deallocation.
* *
*To have the PaddleBuf allocate and manage the memory: * To have the PaddleBuf allocate and manage the memory:
*- PaddleBuf(size_t length) will allocate a memory of size `length`. * - PaddleBuf(size_t length) will allocate a memory of size `length`.
*- Resize(size_t length) resize the memory to no less than `length`, ATTENTION * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done. * if the allocated memory is larger than `length`, nothing will done.
*
* Usage:
*
* Let PaddleBuf manage the memory internally.
* \code{cpp}
* const int num_elements = 128;
* PaddleBuf buf(num_elements * sizeof(float));
* \endcode
*
* Or
* \code{cpp}
* PaddleBuf buf;
* buf.Resize(num_elements * sizeof(float));
* \endcode
* Works the exactly the same.
*
* One can also make the `PaddleBuf` use the external memory.
* \code{cpp}
* PaddleBuf buf;
* void* external_memory = new float[num_elements];
* buf.Reset(external_memory, num_elements*sizeof(float));
* ...
* delete[] external_memory; // manage the memory lifetime outside.
* \endcode
*/ */
class PaddleBuf { class PaddleBuf {
public: public:
...@@ -78,7 +107,7 @@ class PaddleBuf { ...@@ -78,7 +107,7 @@ class PaddleBuf {
/** Tell whether the buffer is empty. /** Tell whether the buffer is empty.
*/ */
bool empty() const { return length_ == 0; } bool empty() const { return length_ == 0; }
/** Get the memory address. /** Get the data's memory address.
*/ */
void* data() const { return data_; } void* data() const { return data_; }
/** Get the memory length. /** Get the memory length.
...@@ -110,7 +139,8 @@ struct PaddleTensor { ...@@ -110,7 +139,8 @@ struct PaddleTensor {
}; };
enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
/** Tensor without copy, currently only supports AnalysisPredictor.
/** Tensor without copy, currently only supports `AnalysisPredictor`.
*/ */
class ZeroCopyTensor { class ZeroCopyTensor {
public: public:
...@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
* *
* Usage: * Usage:
* *
* \code{.cpp}
* NativeConfig config; * NativeConfig config;
* ... // change the configs. * ... // change the configs.
* auto native_predictor = CreatePaddlePredictor(config); * auto native_predictor = CreatePaddlePredictor(config);
* \endcode
* *
* FOR EXTENSION DEVELOPER: * FOR EXTENSION DEVELOPER:
* Different predictors are designated by config type. Similar configs can be * Different predictors are designated by config type. Similar configs can be
......
...@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() { ...@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet"; LOG(ERROR) << "GPU not support MKLDNN yet";
} }
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif
});
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
analysis_passes_.push_back(pass); analysis_passes_.push_back(pass);
} }
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
"repeated_fc_relu_fuse_pass", //
"squared_mat_sub_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"identity_scale_op_clean_pass", //
});
use_gpu_ = false;
}
} // namespace paddle } // namespace paddle
...@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder { ...@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
*/ */
class CpuPassStrategy : public PassStrategy { class CpuPassStrategy : public PassStrategy {
public: public:
CpuPassStrategy() : PassStrategy({}) { CpuPassStrategy();
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
"repeated_fc_relu_fuse_pass", //
"squared_mat_sub_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"identity_scale_op_clean_pass", //
});
use_gpu_ = false;
}
explicit CpuPassStrategy(const CpuPassStrategy &other) explicit CpuPassStrategy(const CpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {} : PassStrategy(other.AllPasses()) {}
...@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy { ...@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
*/ */
class GpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy {
public: public:
GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy();
passes_.assign({
"infer_clean_graph_pass", //
"identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif
});
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
explicit GpuPassStrategy(const GpuPassStrategy &other) explicit GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) { : PassStrategy(other.AllPasses()) {
......
...@@ -83,7 +83,7 @@ class ChunkedAllocator : public Allocator { ...@@ -83,7 +83,7 @@ class ChunkedAllocator : public Allocator {
VLOG(1) << "Create AutoIncrementAllocator with chunk_size " VLOG(1) << "Create AutoIncrementAllocator with chunk_size "
<< max_chunk_size_ << " and capacity " << capacity; << max_chunk_size_ << " and capacity " << capacity;
default_allocator_ = std::make_shared<AutoIncrementAllocator>( default_allocator_ = std::make_shared<AutoIncrementAllocator>(
[this] { return std::move(CreateAllocatorWithChunk()); }, capacity); [this] { return CreateAllocatorWithChunk(); }, capacity);
} }
} }
......
...@@ -111,6 +111,8 @@ size_t BestFitAllocator::NumFreeChunks() const { ...@@ -111,6 +111,8 @@ size_t BestFitAllocator::NumFreeChunks() const {
} }
void BestFitAllocator::Free(Allocation* allocation) { void BestFitAllocator::Free(Allocation* allocation) {
auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation); auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(bf_allocation,
"The input allocation is not BestFitAllocation.");
auto chunk_it = bf_allocation->ChunkIterator(); auto chunk_it = bf_allocation->ChunkIterator();
PADDLE_ENFORCE(!chunk_it->is_free); PADDLE_ENFORCE(!chunk_it->is_free);
chunk_it->is_free = true; chunk_it->is_free = true;
......
...@@ -36,6 +36,7 @@ DEFINE_bool(init_allocated_mem, false, ...@@ -36,6 +36,7 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value " "that initializing the allocated memory with a small value "
"during unit testing."); "during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_bool(benchmark);
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -198,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -198,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place)); << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
platform::SetDeviceId(cur_dev); platform::SetDeviceId(cur_dev);
} else { } else {
if (VLOG_IS_ON(3)) { if (FLAGS_benchmark) {
allocation::GPUMemMonitor.Add(place.device, size); allocation::GPUMemMonitor.Add(place.device, size);
} }
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
...@@ -216,7 +217,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p, ...@@ -216,7 +217,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator(place.device)->Free(p); GetGPUBuddyAllocator(place.device)->Free(p);
if (VLOG_IS_ON(3)) { if (FLAGS_benchmark) {
allocation::GPUMemMonitor.Minus(place.device, size); allocation::GPUMemMonitor.Minus(place.device, size);
} }
#else #else
...@@ -257,7 +258,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, ...@@ -257,7 +258,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *ptr = buddy_allocator->Alloc(size); void *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) { if (ptr == nullptr) {
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size
<< " bytes in CUDAPinnedPlace"; << " bytes in CUDAPinnedPlace";
} }
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
......
...@@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, ...@@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
// "CPUPinnedAllocator should be used for Cross-Device Communication"); // "CPUPinnedAllocator should be used for Cross-Device Communication");
void *ptr; void *ptr;
PADDLE_ENFORCE(cudaMallocHost(&ptr, size)); PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
return new CPUPinnedAllocation(ptr, size); return new CPUPinnedAllocation(ptr, size);
} }
} // namespace allocation } // namespace allocation
......
...@@ -19,7 +19,7 @@ namespace paddle { ...@@ -19,7 +19,7 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
// Allocator uses `cudaMallocHost` // Allocator uses `cudaHostAlloc`
class CPUPinnedAllocation : public Allocation { class CPUPinnedAllocation : public Allocation {
public: public:
CPUPinnedAllocation(void *ptr, size_t size) CPUPinnedAllocation(void *ptr, size_t size)
......
...@@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { ...@@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
void* p; void* p;
// PINNED memory is visible to all CUDA contexts. // PINNED memory is visible to all CUDA contexts.
cudaError_t result = cudaMallocHost(&p, size); cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable);
if (result == cudaSuccess) { if (result == cudaSuccess) {
*index = 1; // PINNED memory *index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size; cuda_pinnd_alloc_size_ += size;
return p; return p;
} else { } else {
LOG(WARNING) << "cudaMallocHost failed."; LOG(WARNING) << "cudaHostAlloc failed.";
return nullptr; return nullptr;
} }
......
...@@ -37,7 +37,7 @@ using paddle::framework::Tensor; ...@@ -37,7 +37,7 @@ using paddle::framework::Tensor;
"(bool, default false) Set to true for inference only, false " \ "(bool, default false) Set to true for inference only, false " \
"for training. Some layers may run faster when this is true.") \ "for training. Some layers may run faster when this is true.") \
.SetDefault(false); \ .SetDefault(false); \
AddComment(#OP_COMMENT); \ AddComment(OP_COMMENT); \
} \ } \
} }
...@@ -124,7 +124,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel { ...@@ -124,7 +124,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
UNUSED constexpr char SigmoidDoc[] = R"DOC( UNUSED constexpr char SigmoidDoc[] = R"DOC(
Sigmoid Activation Operator Sigmoid Activation Operator
$$out = \frac{1}{1 + e^{-x}}$$ $$out = \\frac{1}{1 + e^{-x}}$$
)DOC"; )DOC";
...@@ -187,14 +187,14 @@ $out = |x|$ ...@@ -187,14 +187,14 @@ $out = |x|$
UNUSED constexpr char CeilDoc[] = R"DOC( UNUSED constexpr char CeilDoc[] = R"DOC(
Ceil Activation Operator. Ceil Activation Operator.
$out = ceil(x)$ $out = \left \lceil x \right \rceil$
)DOC"; )DOC";
UNUSED constexpr char FloorDoc[] = R"DOC( UNUSED constexpr char FloorDoc[] = R"DOC(
Floor Activation Operator. Floor Activation Operator.
$out = floor(x)$ $out = \left \lfloor x \right \rfloor$
)DOC"; )DOC";
...@@ -252,7 +252,7 @@ $out = \ln(1 + e^{x})$ ...@@ -252,7 +252,7 @@ $out = \ln(1 + e^{x})$
UNUSED constexpr char SoftsignDoc[] = R"DOC( UNUSED constexpr char SoftsignDoc[] = R"DOC(
Softsign Activation Operator. Softsign Activation Operator.
$$out = \frac{x}{1 + |x|}$$ $$out = \\frac{x}{1 + \|x\|}$$
)DOC"; )DOC";
......
...@@ -222,7 +222,7 @@ void Conv2DOpMaker::Make() { ...@@ -222,7 +222,7 @@ void Conv2DOpMaker::Make() {
.SetDefault(4096); .SetDefault(4096);
AddAttr<bool>("exhaustive_search", AddAttr<bool>("exhaustive_search",
"(bool, default false) cuDNN has many algorithm to calculation " "(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search ", "convolution, whether enable exhaustive search "
"for cuDNN convolution or not, defalut is False.") "for cuDNN convolution or not, defalut is False.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
...@@ -341,7 +341,7 @@ void Conv3DOpMaker::Make() { ...@@ -341,7 +341,7 @@ void Conv3DOpMaker::Make() {
.SetDefault(4096); .SetDefault(4096);
AddAttr<bool>("exhaustive_search", AddAttr<bool>("exhaustive_search",
"(bool, default false) cuDNN has many algorithm to calculation " "(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search ", "convolution, whether enable exhaustive search "
"for cuDNN convolution or not, defalut is False.") "for cuDNN convolution or not, defalut is False.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
......
...@@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel { ...@@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel {
"The shape of PriorBox is [N, 4]"); "The shape of PriorBox is [N, 4]");
if (ctx->HasInput("PriorBoxVar")) { if (ctx->HasInput("PriorBoxVar")) {
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
PADDLE_ENFORCE( PADDLE_ENFORCE(prior_box_var_dims.size() == 2,
prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2, "Input(PriorBoxVar) of BoxCoderOp should be 2.");
"Input(PriorBoxVar) of BoxCoderOp should be 1 or 2."); PADDLE_ENFORCE_EQ(
if (prior_box_var_dims.size() == 1) { prior_box_dims, prior_box_var_dims,
PADDLE_ENFORCE_EQ( "The dimension of Input(PriorBoxVar) should be equal to"
prior_box_var_dims[0], 4, "the dimension of Input(PriorBox) when the rank is 2.");
"The 1st dimension of Input(PriorBoxVar) should be 4"
"when the rank is 1.");
} else {
PADDLE_ENFORCE_EQ(
prior_box_dims, prior_box_var_dims,
"The dimension of Input(PriorBoxVar) should be equal to"
"the dimension of Input(PriorBox when the rank is 2.)");
}
} }
} }
......
...@@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel( ...@@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel(
output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
if (prior_box_var_data) { if (prior_box_var_data) {
int prior_var_offset = 0; int prior_var_offset = col_idx * len;
if (prior_box_var_size == 2) {
prior_var_offset = col_idx * len;
}
output[idx * len] /= prior_box_var_data[prior_var_offset]; output[idx * len] /= prior_box_var_data[prior_var_offset];
output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1]; output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2]; output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
...@@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel( ...@@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel(
T box_var_x = T(1), box_var_y = T(1); T box_var_x = T(1), box_var_y = T(1);
T box_var_w = T(1), box_var_h = T(1); T box_var_w = T(1), box_var_h = T(1);
if (prior_box_var_data) { if (prior_box_var_data) {
int prior_var_offset = 0; int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
if (prior_box_var_size == 2) {
prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
}
box_var_x = prior_box_var_data[prior_var_offset]; box_var_x = prior_box_var_data[prior_var_offset];
box_var_y = prior_box_var_data[prior_var_offset + 1]; box_var_y = prior_box_var_data[prior_var_offset + 1];
box_var_w = prior_box_var_data[prior_var_offset + 2]; box_var_w = prior_box_var_data[prior_var_offset + 2];
......
...@@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output[offset + 3] = output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height)); std::log(std::fabs(target_box_height / prior_box_height));
if (prior_box_var) { if (prior_box_var) {
int prior_var_offset = 0; int prior_var_offset = j * len;
if (prior_box_var->dims().size() == 2) {
prior_var_offset = j * len;
}
output[offset] /= prior_box_var_data[prior_var_offset]; output[offset] /= prior_box_var_data[prior_var_offset];
output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
...@@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> {
} }
} }
} }
template <int axis, int var_size>
void DecodeCenterSize(const framework::Tensor* target_box, void DecodeCenterSize(const framework::Tensor* target_box,
const framework::Tensor* prior_box, const framework::Tensor* prior_box,
const framework::Tensor* prior_box_var, const framework::Tensor* prior_box_var,
const bool normalized, const int axis, const bool normalized, std::vector<float> variance,
const std::vector<float> variance, T* output) const { T* output) const {
int64_t row = target_box->dims()[0]; int64_t row = target_box->dims()[0];
int64_t col = target_box->dims()[1]; int64_t col = target_box->dims()[1];
int64_t len = target_box->dims()[2]; int64_t len = target_box->dims()[2];
...@@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> {
auto* target_box_data = target_box->data<T>(); auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>(); auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr; const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>(); if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
int prior_box_offset = 0; int prior_box_offset = 0;
T var_data[4] = {1., 1., 1., 1.};
T* var_ptr = var_data;
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif #endif
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len; size_t offset = i * col * len + j * len;
if (axis == 0) { prior_box_offset = axis == 0 ? j * len : i * len;
prior_box_offset = j * len;
} else if (axis == 1) {
prior_box_offset = i * len;
}
T prior_box_width = prior_box_data[prior_box_offset + 2] - T prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] + prior_box_data[prior_box_offset] +
(normalized == false); (normalized == false);
...@@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T target_box_center_x = 0, target_box_center_y = 0; T target_box_center_x = 0, target_box_center_y = 0;
T target_box_width = 0, target_box_height = 0; T target_box_width = 0, target_box_height = 0;
T box_var_x = T(1), box_var_y = T(1); int prior_var_offset = axis == 0 ? j * len : i * len;
T box_var_w = T(1), box_var_h = T(1); if (var_size == 2) {
if (prior_box_var) { std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
int prior_var_offset = 0; 4 * sizeof(T));
if (prior_box_var->dims().size() == 2) { } else if (var_size == 1) {
if (axis == 0) var_ptr = reinterpret_cast<T*>(variance.data());
prior_var_offset = j * len;
else if (axis == 1)
prior_var_offset = i * len;
}
box_var_x = prior_box_var_data[prior_var_offset];
box_var_y = prior_box_var_data[prior_var_offset + 1];
box_var_w = prior_box_var_data[prior_var_offset + 2];
box_var_h = prior_box_var_data[prior_var_offset + 3];
} else if (!(variance.empty())) {
box_var_x = static_cast<T>(variance[0]);
box_var_y = static_cast<T>(variance[1]);
box_var_w = static_cast<T>(variance[2]);
box_var_h = static_cast<T>(variance[3]);
} }
T box_var_x = *var_ptr;
T box_var_y = *(var_ptr + 1);
T box_var_w = *(var_ptr + 2);
T box_var_h = *(var_ptr + 3);
target_box_center_x = target_box_center_x =
box_var_x * target_box_data[offset] * prior_box_width + box_var_x * target_box_data[offset] * prior_box_width +
prior_box_center_x; prior_box_center_x;
...@@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> {
EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
variance, output); variance, output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) { } else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis, if (prior_box_var) {
variance, output); if (axis == 0) {
DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
} else if (!(variance.empty())) {
if (axis == 0) {
DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
} else {
if (axis == 0) {
DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
}
} }
} }
}; };
......
...@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { ...@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
} }
}; };
class ElementwiseGradOpInplace : public framework::InplaceInToOut {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> ret;
if (block->HasVar(framework::GradVarName("X")) &&
block->HasVar(framework::GradVarName("Out"))) {
ret[framework::GradVarName("Out")] = framework::GradVarName("X");
}
return ret;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { ...@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
op_type##GradMaker, \ op_type##GradMaker, \
::paddle::operators::ElementwiseOpInplace); \ ::paddle::operators::ElementwiseOpInplace); \
REGISTER_OPERATOR(op_type##_grad, \ REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad) ::paddle::operators::ElementwiseOpExplicitGrad, \
::paddle::operators::ElementwiseGradOpInplace)
...@@ -146,7 +146,11 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, ...@@ -146,7 +146,11 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp); REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>); expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
ops::ExpandKernel<paddle::platform::CPUDeviceContext, double>,
ops::ExpandKernel<paddle::platform::CPUDeviceContext, int>,
ops::ExpandKernel<paddle::platform::CPUDeviceContext, bool>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand_grad, expand_grad,
ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>); ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -15,7 +15,11 @@ limitations under the License. */ ...@@ -15,7 +15,11 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>); expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>,
ops::ExpandKernel<paddle::platform::CUDADeviceContext, double>,
ops::ExpandKernel<paddle::platform::CUDADeviceContext, int>,
ops::ExpandKernel<paddle::platform::CUDADeviceContext, bool>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
expand_grad, expand_grad,
ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>); ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -21,26 +21,17 @@ limitations under the License. */ ...@@ -21,26 +21,17 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T, int MajorType = Eigen::RowMajor, template <typename T>
typename IndexType = Eigen::DenseIndex> struct Compare {
using EigenVectorArrayMap = public:
Eigen::TensorMap<Eigen::Tensor<T, 1, MajorType, IndexType>>; bool operator()(const T a, const T b) { return (std::abs(a) < std::abs(b)); }
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using ConstEigenVectorArrayMap =
Eigen::TensorMap<const Eigen::Tensor<T, 1, MajorType, IndexType>>;
template <typename T> template <typename T>
struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> { struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, const T* in, void operator()(const platform::CPUDeviceContext& ctx, const T* in,
const int num, T* out) { const int num, T* out) {
Eigen::DSizes<Eigen::DenseIndex, 1> idim(num); *out = *(std::max_element(in + 0, in + num, Compare<T>()));
Eigen::DSizes<Eigen::DenseIndex, 1> odim(1);
Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>> in_e(in, idim);
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> out_e(out, odim);
out_e = in_e.abs().maximum();
} }
}; };
......
...@@ -63,7 +63,6 @@ class VActFunc : public JitCode { ...@@ -63,7 +63,6 @@ class VActFunc : public JitCode {
public: public:
explicit VActFunc(size_t code_size, void* code_ptr) explicit VActFunc(size_t code_size, void* code_ptr)
: JitCode(code_size, code_ptr) {} : JitCode(code_size, code_ptr) {}
virtual const char* name() const = 0;
virtual void genCode() = 0; virtual void genCode() = 0;
protected: protected:
...@@ -269,7 +268,7 @@ class VActJitCode : public VActFunc { ...@@ -269,7 +268,7 @@ class VActJitCode : public VActFunc {
this->genCode(); this->genCode();
} }
const char* name() const override { std::string name() const override {
std::string base = "VActJitCode"; std::string base = "VActJitCode";
switch (type_) { switch (type_) {
case operand_type::RELU: case operand_type::RELU:
...@@ -293,7 +292,7 @@ class VActJitCode : public VActFunc { ...@@ -293,7 +292,7 @@ class VActJitCode : public VActFunc {
default: default:
break; break;
} }
return base.c_str(); return base;
} }
void genCode() override; void genCode() override;
......
...@@ -41,7 +41,7 @@ class VXXJitCode : public JitCode { ...@@ -41,7 +41,7 @@ class VXXJitCode : public JitCode {
this->genCode(); this->genCode();
} }
virtual const char* name() const { std::string name() const override {
std::string base = "VXXJitCode"; std::string base = "VXXJitCode";
if (scalar_index_ == 1) { if (scalar_index_ == 1) {
base += "_Scalar"; base += "_Scalar";
...@@ -62,7 +62,7 @@ class VXXJitCode : public JitCode { ...@@ -62,7 +62,7 @@ class VXXJitCode : public JitCode {
} }
base += (with_relu_ ? "_Relu" : ""); base += (with_relu_ ? "_Relu" : "");
base += "_D" + std::to_string(num_); base += "_D" + std::to_string(num_);
return base.c_str(); return base;
} }
void genCode() override; void genCode() override;
......
...@@ -49,7 +49,7 @@ class GRUJitCode : public VActFunc { ...@@ -49,7 +49,7 @@ class GRUJitCode : public VActFunc {
this->genCode(); this->genCode();
} }
const char* name() const override { std::string name() const override {
std::string base = "GRUJitCode"; std::string base = "GRUJitCode";
if (id_ == 0) { if (id_ == 0) {
base += "_H1"; base += "_H1";
...@@ -81,7 +81,7 @@ class GRUJitCode : public VActFunc { ...@@ -81,7 +81,7 @@ class GRUJitCode : public VActFunc {
}; };
AddTypeStr(act_gate_); AddTypeStr(act_gate_);
AddTypeStr(act_cand_); AddTypeStr(act_cand_);
return base.c_str(); return base;
} }
void genCode() override; void genCode() override;
......
...@@ -35,14 +35,14 @@ class HOPVJitCode : public JitCode { ...@@ -35,14 +35,14 @@ class HOPVJitCode : public JitCode {
this->genCode(); this->genCode();
} }
virtual const char* name() const { std::string name() const override {
std::string base = "VXXJitCode"; std::string base = "VXXJitCode";
if (type_ == operand_type::MAX) { if (type_ == operand_type::MAX) {
base += "_MAX"; base += "_MAX";
} else { } else {
base += "_SUM"; base += "_SUM";
} }
return base.c_str(); return base;
} }
void genCode() override; void genCode() override;
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <string>
#include <type_traits> #include <type_traits>
#include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -59,7 +60,7 @@ typedef enum { ...@@ -59,7 +60,7 @@ typedef enum {
} operand_type; } operand_type;
#define DECLARE_JIT_CODE(codename) \ #define DECLARE_JIT_CODE(codename) \
const char* name() const override { return #codename; } std::string name() const override { return #codename; }
class JitCode : public GenBase, public Xbyak::CodeGenerator { class JitCode : public GenBase, public Xbyak::CodeGenerator {
public: public:
...@@ -68,7 +69,6 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { ...@@ -68,7 +69,6 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
(code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size), (code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size),
code_ptr) {} code_ptr) {}
virtual const char* name() const = 0;
virtual void genCode() = 0; virtual void genCode() = 0;
size_t getSize() const override { return CodeGenerator::getSize(); } size_t getSize() const override { return CodeGenerator::getSize(); }
......
...@@ -53,7 +53,7 @@ class LSTMJitCode : public VActFunc { ...@@ -53,7 +53,7 @@ class LSTMJitCode : public VActFunc {
this->genCode(); this->genCode();
} }
const char* name() const override { std::string name() const override {
std::string base = "LSTMJitCode"; std::string base = "LSTMJitCode";
if (use_peephole_) { if (use_peephole_) {
base += "_Peephole"; base += "_Peephole";
...@@ -85,7 +85,7 @@ class LSTMJitCode : public VActFunc { ...@@ -85,7 +85,7 @@ class LSTMJitCode : public VActFunc {
AddTypeStr(act_gate_); AddTypeStr(act_gate_);
AddTypeStr(act_cand_); AddTypeStr(act_cand_);
AddTypeStr(act_cell_); AddTypeStr(act_cell_);
return base.c_str(); return base;
} }
void genCode() override; void genCode() override;
......
...@@ -36,11 +36,11 @@ class MatMulJitCode : public JitCode { ...@@ -36,11 +36,11 @@ class MatMulJitCode : public JitCode {
this->genCode(); this->genCode();
} }
virtual const char* name() const { std::string name() const override {
std::string base = "MatMulJitCode"; std::string base = "MatMulJitCode";
base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
std::to_string(k_); std::to_string(k_);
return base.c_str(); return base;
} }
void genCode() override; void genCode() override;
......
...@@ -38,7 +38,7 @@ class SeqPoolJitCode : public JitCode { ...@@ -38,7 +38,7 @@ class SeqPoolJitCode : public JitCode {
this->genCode(); this->genCode();
} }
virtual const char* name() const { std::string name() const override {
std::string base = "SeqPoolJitCode"; std::string base = "SeqPoolJitCode";
if (type_ == SeqPoolType::kSum) { if (type_ == SeqPoolType::kSum) {
base += "_Sum"; base += "_Sum";
...@@ -48,7 +48,7 @@ class SeqPoolJitCode : public JitCode { ...@@ -48,7 +48,7 @@ class SeqPoolJitCode : public JitCode {
base += "_Sqrt"; base += "_Sqrt";
} }
base += ("_W" + std::to_string(w_)); base += ("_W" + std::to_string(w_));
return base.c_str(); return base;
} }
void genCode() override; void genCode() override;
......
...@@ -17,7 +17,13 @@ ...@@ -17,7 +17,13 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <vector> #include <vector>
#include "paddle/fluid/memory/allocation/cpu_allocator.h" // for posix_memalign
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifndef _WIN32
#define posix_memalign_free free
#endif
DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
...@@ -40,6 +46,17 @@ void GenBase::dumpCode(const unsigned char* code) const { ...@@ -40,6 +46,17 @@ void GenBase::dumpCode(const unsigned char* code) const {
} }
} }
void* GenBase::operator new(size_t size) {
void* ptr;
constexpr size_t alignment = 32ul;
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0,
"GenBase Alloc %ld error!", size);
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
return ptr;
}
void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) { std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
int block; int block;
int max_num_regs; int max_num_regs;
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <memory> // for unique_ptr #include <memory> // for unique_ptr
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_base.h"
...@@ -28,7 +29,7 @@ namespace jit { ...@@ -28,7 +29,7 @@ namespace jit {
class GenBase : public Kernel { class GenBase : public Kernel {
public: public:
virtual ~GenBase() = default; virtual ~GenBase() = default;
virtual const char* name() const = 0; virtual std::string name() const = 0;
virtual size_t getSize() const = 0; virtual size_t getSize() const = 0;
virtual const unsigned char* getCodeInternal() = 0; virtual const unsigned char* getCodeInternal() = 0;
template <typename Func> template <typename Func>
...@@ -42,6 +43,11 @@ class GenBase : public Kernel { ...@@ -42,6 +43,11 @@ class GenBase : public Kernel {
return reinterpret_cast<Func>(const_cast<unsigned char*>(code)); return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
} }
void* operator new(size_t size);
void operator delete(void* ptr);
void* operator new[](size_t size) { return operator new(size); }
void operator delete[](void* ptr) { operator delete(ptr); }
protected: protected:
void dumpCode(const unsigned char* code) const; void dumpCode(const unsigned char* code) const;
}; };
......
...@@ -129,6 +129,7 @@ class LookupTableGradKernel : public framework::OpKernel<T> { ...@@ -129,6 +129,7 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
"must be either LoDTensor or SelectedRows"); "must be either LoDTensor or SelectedRows");
} }
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
bool is_sparse = context.Attr<bool>("is_sparse"); bool is_sparse = context.Attr<bool>("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of // Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward. // paddings makes no sense and we don't deal with it in backward.
...@@ -187,10 +188,15 @@ class LookupTableGradKernel : public framework::OpKernel<T> { ...@@ -187,10 +188,15 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
memset(d_table_data, 0, d_table->numel() * sizeof(T)); memset(d_table_data, 0, d_table->numel() * sizeof(T));
for (int64_t i = 0; i < ids->numel(); ++i) { for (int64_t i = 0; i < ids->numel(); ++i) {
PADDLE_ENFORCE_LT(ids_data[i], N); if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
PADDLE_ENFORCE_GE(ids_data[i], 0); // the gradient of padding_idx should be 0, already done by memset, so
for (int j = 0; j < D; ++j) { // do nothing.
d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; } else {
PADDLE_ENFORCE_LT(ids_data[i], N);
PADDLE_ENFORCE_GE(ids_data[i], 0);
for (int j = 0; j < D; ++j) {
d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
}
} }
} }
} }
......
...@@ -37,7 +37,7 @@ math_library(concat_and_split) ...@@ -37,7 +37,7 @@ math_library(concat_and_split)
math_library(context_project DEPS im2col math_function) math_library(context_project DEPS im2col math_function)
math_library(cross_entropy) math_library(cross_entropy)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv) math_library(depthwise_conv DEPS cub)
math_library(im2col) math_library(im2col)
math_library(sampler) math_library(sampler)
......
...@@ -282,7 +282,7 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -282,7 +282,7 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
? mkldnn::inner_product_backward_weights::desc( ? mkldnn::inner_product_backward_weights::desc(
src, diff_weights, bias, diff_dst) src, diff_weights, bias, diff_dst)
: mkldnn::inner_product_backward_weights::desc( : mkldnn::inner_product_backward_weights::desc(
src, diff_weights, bias, diff_dst); src, diff_weights, diff_dst);
return mkldnn::inner_product_backward_weights::primitive_desc( return mkldnn::inner_product_backward_weights::primitive_desc(
bwd_weight_desc, engine, pd); bwd_weight_desc, engine, pd);
......
...@@ -34,6 +34,8 @@ std::map<std::string, ...@@ -34,6 +34,8 @@ std::map<std::string,
{"accuracy", NG_OPS::BuildAccuracyNode}, {"accuracy", NG_OPS::BuildAccuracyNode},
{"conv2d", NG_OPS::BuildConv2dNode}, {"conv2d", NG_OPS::BuildConv2dNode},
{"conv2d_grad", NG_OPS::BuildConv2dGradNode}, {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
{"batch_norm", NG_OPS::BuildBatchNormNode},
{"batch_norm_grad", NG_OPS::BuildBatchNormGradNode},
{"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
{"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
{"fill_constant", NG_OPS::BuildFillConstantNode}, {"fill_constant", NG_OPS::BuildFillConstantNode},
...@@ -46,8 +48,12 @@ std::map<std::string, ...@@ -46,8 +48,12 @@ std::map<std::string,
{"softmax", NG_OPS::BuildSoftmaxNode}, {"softmax", NG_OPS::BuildSoftmaxNode},
{"softmax_grad", NG_OPS::BuildSoftmaxGradNode}, {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
{"scale", NG_OPS::BuildScaleNode}, {"scale", NG_OPS::BuildScaleNode},
{"sigmoid", NG_OPS::BuildUnaryNode<ngraph::op::Sigmoid>},
{"sum", NG_OPS::BuildSumNode},
{"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>}, {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
{"relu_grad", NG_OPS::BuildReluGradNode},
{"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>}, {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
{"tanh_grad", NG_OPS::BuildTanhGradNode},
{"top_k", NG_OPS::BuildTopKNode}}; {"top_k", NG_OPS::BuildTopKNode}};
void NgraphBridge::BuildNgNode( void NgraphBridge::BuildNgNode(
......
...@@ -35,7 +35,7 @@ class NgraphEngineOp : public framework::OperatorWithKernel { ...@@ -35,7 +35,7 @@ class NgraphEngineOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
framework::OpKernelType kt = framework::OpKernelType( framework::OpKernelType kt = framework::OpKernelType(
framework::proto::VarType::FP32, ctx.GetPlace()); framework::proto::VarType::FP32, platform::CPUPlace());
return kt; return kt;
} }
}; };
......
...@@ -22,6 +22,8 @@ limitations under the License. */ ...@@ -22,6 +22,8 @@ limitations under the License. */
#pragma once #pragma once
#include "ops/accuracy_op.h" #include "ops/accuracy_op.h"
#include "ops/activation_op.h"
#include "ops/batch_norm_op.h"
#include "ops/binary_unary_op.h" #include "ops/binary_unary_op.h"
#include "ops/conv2d_op.h" #include "ops/conv2d_op.h"
#include "ops/elementwise_add_op.h" #include "ops/elementwise_add_op.h"
...@@ -31,4 +33,5 @@ limitations under the License. */ ...@@ -31,4 +33,5 @@ limitations under the License. */
#include "ops/pool2d_op.h" #include "ops/pool2d_op.h"
#include "ops/scale_op.h" #include "ops/scale_op.h"
#include "ops/softmax_op.h" #include "ops/softmax_op.h"
#include "ops/sum_op.h"
#include "ops/top_k_op.h" #include "ops/top_k_op.h"
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildReluGradNode(
const std::shared_ptr<framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto out = platform::GetInputNode(op, "Out", ngb_node_map);
auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
auto relu_grad = std::make_shared<ngraph::op::ReluBackprop>(out, dout);
platform::SetOutputNode(op, "X@GRAD", relu_grad, ngb_node_map);
}
void BuildTanhGradNode(
const std::shared_ptr<framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto out = platform::GetInputNode(op, "Out", ngb_node_map);
auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
auto shape = out->get_shape();
auto node_const =
ngraph::op::Constant::create(ngraph::element::f32, shape, {1});
auto result = dout * (node_const - out * out);
platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildBatchNormNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
auto& data_layout = op_attrs.Get<std::string>("data_layout");
auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map);
auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map);
auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
const bool is_test = op_attrs.Get<bool>("is_test");
const float epsilon = op_attrs.Get<float>("epsilon");
const float momentum = op_attrs.Get<float>("momentum");
if (data_layout == "NHWC") {
x = paddle::platform::Nhwc2Nchw(x);
}
std::shared_ptr<ngraph::Node> mean_out, saved_mean, saved_variance,
variance_out, y;
if (!is_test) {
auto BN = std::make_shared<ngraph::op::BatchNormTraining>(epsilon, scale,
bias, x);
y = std::make_shared<ngraph::op::GetOutputElement>(BN, 0);
saved_mean = std::make_shared<ngraph::op::GetOutputElement>(BN, 1);
saved_variance = std::make_shared<ngraph::op::GetOutputElement>(BN, 2);
mean_out = std::make_shared<ngraph::op::Add>(
paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
momentum, mean),
paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
1. - momentum, saved_mean));
variance_out = std::make_shared<ngraph::op::Add>(
paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
momentum, variance),
paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
1. - momentum, saved_variance));
if (data_layout == "NHWC") {
y = paddle::platform::Nchw2Nhwc(y);
}
paddle::platform::SetOutputNode(op, "MeanOut", mean_out, ngb_node_map);
paddle::platform::SetOutputNode(op, "VarianceOut", variance_out,
ngb_node_map);
paddle::platform::SetOutputNode(op, "SavedMean", saved_mean, ngb_node_map);
paddle::platform::SetOutputNode(op, "SavedVariance", saved_variance,
ngb_node_map);
paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
} else {
y = std::make_shared<ngraph::op::BatchNormInference>(epsilon, scale, bias,
x, mean, variance);
paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
}
}
void BuildBatchNormGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
auto& data_layout = op_attrs.Get<std::string>("data_layout");
auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
auto saved_mean =
paddle::platform::GetInputNode(op, "SavedMean", ngb_node_map);
auto saved_variance =
paddle::platform::GetInputNode(op, "SavedVariance", ngb_node_map);
auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
auto x_shape = x->get_shape();
auto dy_shape = dy->get_shape();
PADDLE_ENFORCE(x_shape.size() == 2 || x_shape.size() == 4,
"BN grap input size needs to be 2 or 4");
PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(),
"BN grap input and delta size needs to be equal");
if (x_shape.size() == 2) {
x = std::make_shared<ngraph::op::Reshape>(
x, ngraph::AxisVector{0, 1},
ngraph::Shape{x_shape.at(0), x_shape.at(1), 1, 1});
dy = std::make_shared<ngraph::op::Reshape>(
dy, ngraph::AxisVector{0, 1},
ngraph::Shape{dy_shape.at(0), dy_shape.at(1), 1, 1});
}
if (data_layout == "NHWC") {
x = paddle::platform::Nhwc2Nchw(dy);
dy = paddle::platform::Nhwc2Nchw(dy);
}
const float epsilon = op_attrs.Get<float>("epsilon");
auto bn_bprop = std::make_shared<ngraph::op::BatchNormTrainingBackprop>(
epsilon, scale, bias, x, saved_mean, saved_variance, dy);
std::shared_ptr<ngraph::Node> dx =
std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 0);
auto dscale = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 1);
auto dbias = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 2);
paddle::platform::SetOutputNode(op, "Bias@GRAD", dbias, ngb_node_map);
paddle::platform::SetOutputNode(op, "Scale@GRAD", dscale, ngb_node_map);
if (x_shape.size() == 2) {
paddle::platform::SetOutputNode(
op, "X@GRAD", paddle::platform::NgReshaper(dx, x_shape), ngb_node_map);
} else {
if (data_layout == "NHWC") {
dx = paddle::platform::Nchw2Nhwc(dx);
}
paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
}
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildSumNode(
const std::shared_ptr<framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
std::vector<std::string> op_inputs;
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
op_inputs.push_back(var_name);
if (ngb_node_map->find(var_name) == ngb_node_map->end()) {
PADDLE_THROW("op % input varname %s is not found in var_node_map",
op->Type(), var_name);
}
}
}
std::shared_ptr<ngraph::Node>& sum = ngb_node_map->at(op_inputs[0]);
for (size_t k = 1; k < op_inputs.size(); ++k) {
std::shared_ptr<ngraph::Node>& nodek = ngb_node_map->at(op_inputs[k]);
if (nodek->get_element_type() != sum->get_element_type()) {
nodek =
std::make_shared<ngraph::op::Convert>(nodek, sum->get_element_type());
}
sum = sum + nodek;
}
platform::SetOutputNode(op, "Out", sum, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
...@@ -259,7 +259,7 @@ Example: ...@@ -259,7 +259,7 @@ Example:
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$ $$
For exclusive = true: For exclusive = false:
$$ $$
hstart = i * strides[0] - paddings[0] hstart = i * strides[0] - paddings[0]
hend = hstart + ksize[0] hend = hstart + ksize[0]
...@@ -267,7 +267,7 @@ Example: ...@@ -267,7 +267,7 @@ Example:
wend = wstart + ksize[1] wend = wstart + ksize[1]
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$ $$
For exclusive = false: For exclusive = true:
$$ $$
hstart = max(0, i * strides[0] - paddings[0]) hstart = max(0, i * strides[0] - paddings[0])
hend = min(H, hstart + ksize[0]) hend = min(H, hstart + ksize[0])
...@@ -403,7 +403,7 @@ Example: ...@@ -403,7 +403,7 @@ Example:
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$ $$
For exclusive = true: For exclusive = false:
$$ $$
dstart = i * strides[0] - paddings[0] dstart = i * strides[0] - paddings[0]
dend = dstart + ksize[0] dend = dstart + ksize[0]
...@@ -413,7 +413,7 @@ Example: ...@@ -413,7 +413,7 @@ Example:
wend = wstart + ksize[2] wend = wstart + ksize[2]
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$ $$
For exclusive = false: For exclusive = true:
$$ $$
dstart = max(0, i * strides[0] - paddings[0]) dstart = max(0, i * strides[0] - paddings[0])
dend = min(D, dstart + ksize[0]) dend = min(D, dstart + ksize[0])
......
...@@ -121,7 +121,7 @@ struct RandomCropFunctor { ...@@ -121,7 +121,7 @@ struct RandomCropFunctor {
HOSTDEVICE void operator()(size_t ins_idx) { HOSTDEVICE void operator()(size_t ins_idx) {
typename Random<DeviceContext>::Engine engine(seed_); typename Random<DeviceContext>::Engine engine(seed_);
engine.discard(ins_idx * (rank_ - num_batchsize_dims_)); engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
size_t offsets[9]; size_t offsets[9] = {};
for (int i = num_batchsize_dims_; i < rank_; ++i) { for (int i = num_batchsize_dims_; i < rank_; ++i) {
typename Random<DeviceContext>::template UniformIntDist<size_t> dist( typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
0, x_dims_[i] - out_dims_[i]); 0, x_dims_[i] - out_dims_[i]);
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/operators/reader/buffered_reader.h"
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() { ...@@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() {
position_.front().wait(); position_.front().wait();
position_.pop(); position_.pop();
} }
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamDestroy(stream));
for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
}
#endif
} }
BufferedReader::BufferedReader( BufferedReader::BufferedReader(
...@@ -33,6 +41,19 @@ BufferedReader::BufferedReader( ...@@ -33,6 +41,19 @@ BufferedReader::BufferedReader(
thread_pool_(1), thread_pool_(1),
place_(place), place_(place),
buffer_size_(buffer_size) { buffer_size_(buffer_size) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
compute_stream =
((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events.resize(buffer_size);
for (auto &event : events)
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
}
#endif
cpu_buffer_.resize(buffer_size); cpu_buffer_.resize(buffer_size);
gpu_buffer_.resize(buffer_size); gpu_buffer_.resize(buffer_size);
ReadTillBufferFullAsync(); ReadTillBufferFullAsync();
...@@ -46,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() { ...@@ -46,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() {
} }
void BufferedReader::ReadAsync(size_t i) { void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
}
#endif
position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
TensorVec &cpu = cpu_buffer_[i]; TensorVec &cpu = cpu_buffer_[i];
reader_->ReadNext(&cpu); reader_->ReadNext(&cpu);
...@@ -54,14 +81,41 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -54,14 +81,41 @@ void BufferedReader::ReadAsync(size_t i) {
return -1UL; return -1UL;
} }
#ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
TensorVec &gpu = gpu_buffer_[i]; TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size()); gpu.resize(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) { for (size_t i = 0; i < cpu.size(); ++i) {
framework::TensorCopySync(cpu[i], place_, &gpu[i]); gpu[i].Resize(cpu[i].dims());
gpu[i].set_layout(cpu[i].layout());
auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data<void>();
auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
if (platform::is_cuda_pinned_place(cpu_place))
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPinnedPlace>(cpu_place),
cpu_ptr, size, stream);
else if ((platform::is_gpu_place(cpu_place)))
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
size, stream);
else
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
0);
gpu[i].set_lod(cpu[i].lod()); gpu[i].set_lod(cpu[i].lod());
} }
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
} }
#endif
return i; return i;
})); }));
} }
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
#include <vector> #include <vector>
#include "ThreadPool.h" #include "ThreadPool.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -59,6 +62,11 @@ class BufferedReader : public framework::DecoratedReader { ...@@ -59,6 +62,11 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<TensorVec> cpu_buffer_; std::vector<TensorVec> cpu_buffer_;
std::vector<TensorVec> gpu_buffer_; std::vector<TensorVec> gpu_buffer_;
size_t prev_pos_{-1UL}; size_t prev_pos_{-1UL};
#ifdef PADDLE_WITH_CUDA
cudaStream_t stream;
cudaStream_t compute_stream;
std::vector<cudaEvent_t> events;
#endif
}; };
} // namespace reader } // namespace reader
......
...@@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader, ...@@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
framework::LoD lod{lod_data}; framework::LoD lod{lod_data};
lod_tensor.set_lod(lod); lod_tensor.set_lod(lod);
int64_t* tensor_data = lod_tensor.mutable_data<int64_t>( int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
framework::make_ddim({1, static_cast<int64_t>(batch_feasign.size())}), framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
platform::CPUPlace()); platform::CPUPlace());
memcpy(tensor_data, batch_feasign.data(), memcpy(tensor_data, batch_feasign.data(),
batch_feasign.size() * sizeof(int64_t)); batch_feasign.size() * sizeof(int64_t));
...@@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader, ...@@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
// insert label tensor // insert label tensor
framework::LoDTensor label_tensor; framework::LoDTensor label_tensor;
auto* label_tensor_data = label_tensor.mutable_data<int64_t>( auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
framework::make_ddim({1, static_cast<int64_t>(batch_label.size())}), framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
platform::CPUPlace()); platform::CPUPlace());
memcpy(label_tensor_data, batch_label.data(), memcpy(label_tensor_data, batch_label.data(),
batch_label.size() * sizeof(int64_t)); batch_label.size() * sizeof(int64_t));
......
...@@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) { ...@@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) {
std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{b1, b2, b3, std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{b1, b2, b3,
b4}; b4};
std::vector<DDim> label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}}; std::vector<DDim> label_dims = {{3, 1}, {3, 1}, {3, 1}, {1, 1}};
LoDTensorBlockingQueueHolder queue_holder; LoDTensorBlockingQueueHolder queue_holder;
int capacity = 64; int capacity = 64;
......
include(operators) include(operators)
register_operators() if(WITH_GPU)
register_operators(DEPS cub)
else()
register_operators()
endif()
if(WITH_GPU) if(WITH_GPU)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu") file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu")
......
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
py_proto_compile(profiler_py_proto SRCS profiler.proto) py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
...@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) ...@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
cc_library(place SRCS place.cc DEPS enforce boost) cc_library(place SRCS place.cc DEPS enforce boost lib_any)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
add_subdirectory(dynload) add_subdirectory(dynload)
......
...@@ -23,6 +23,26 @@ limitations under the License. */ ...@@ -23,6 +23,26 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
std::shared_ptr<ngraph::Node> Nhwc2Nchw(std::shared_ptr<ngraph::Node> in) {
auto in_shape = in->get_shape();
in_shape[0] = in->get_shape()[0];
in_shape[1] = in->get_shape()[3];
in_shape[2] = in->get_shape()[1];
in_shape[3] = in->get_shape()[2];
ngraph::AxisVector axis_vec = {0, 3, 1, 2};
return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
}
std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) {
auto in_shape = in->get_shape();
in_shape[0] = in->get_shape()[0];
in_shape[1] = in->get_shape()[2];
in_shape[2] = in->get_shape()[3];
in_shape[3] = in->get_shape()[1];
ngraph::AxisVector axis_vec = {0, 2, 3, 1};
return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
}
ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) { ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1, auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
std::multiplies<size_t>()); std::multiplies<size_t>());
......
...@@ -14,6 +14,12 @@ limitations under the License. */ ...@@ -14,6 +14,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -26,5 +26,5 @@ if(WITH_PYTHON) ...@@ -26,5 +26,5 @@ if(WITH_PYTHON)
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_pybind ${os_dependency_modules}) target_link_libraries(paddle_pybind ${os_dependency_modules})
cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind)
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -74,12 +74,12 @@ void BindPaddleBuf(py::module *m) { ...@@ -74,12 +74,12 @@ void BindPaddleBuf(py::module *m) {
.def(py::init([](std::vector<float> &data) { .def(py::init([](std::vector<float> &data) {
auto buf = PaddleBuf(data.size() * sizeof(float)); auto buf = PaddleBuf(data.size() * sizeof(float));
std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length()); std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
return std::move(buf); return buf;
})) }))
.def(py::init([](std::vector<int64_t> &data) { .def(py::init([](std::vector<int64_t> &data) {
auto buf = PaddleBuf(data.size() * sizeof(int64_t)); auto buf = PaddleBuf(data.size() * sizeof(int64_t));
std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length()); std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
return std::move(buf); return buf;
})) }))
.def("resize", &PaddleBuf::Resize) .def("resize", &PaddleBuf::Resize)
.def("reset", .def("reset",
......
...@@ -295,6 +295,7 @@ PYBIND11_MODULE(core, m) { ...@@ -295,6 +295,7 @@ PYBIND11_MODULE(core, m) {
.def("_get_float_element", TensorGetElement<float>) .def("_get_float_element", TensorGetElement<float>)
.def("_set_double_element", TensorSetElement<double>) .def("_set_double_element", TensorSetElement<double>)
.def("_get_double_element", TensorGetElement<double>) .def("_get_double_element", TensorGetElement<double>)
.def("_place", [](Tensor &self) { return self.place(); })
.def("_dtype", [](Tensor &self) { return self.type(); }); .def("_dtype", [](Tensor &self) { return self.type(); });
py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC( py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
...@@ -673,6 +674,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -673,6 +674,12 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<platform::Place>(m, "Place") py::class_<platform::Place>(m, "Place")
.def(py::init<>()) .def(py::init<>())
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
.def("gpu_device_id",
[](platform::Place &self) {
return boost::get<platform::CUDAPlace>(self).device;
})
.def("set_place", .def("set_place",
[](platform::Place &self, const platform::CPUPlace &cpu_place) { [](platform::Place &self, const platform::CPUPlace &cpu_place) {
self = cpu_place; self = cpu_place;
...@@ -1092,10 +1099,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1092,10 +1099,6 @@ All parameter, weight, gradient are variables in Paddle.
"is_distribution", "is_distribution",
[](const BuildStrategy &self) { return self.is_distribution_; }, [](const BuildStrategy &self) { return self.is_distribution_; },
[](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) [](BuildStrategy &self, bool b) { self.is_distribution_ = b; })
.def_property(
"memory_early_delete",
[](const BuildStrategy &self) { return self.memory_early_delete_; },
[](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
.def_property( .def_property(
"enable_inplace", "enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; }, [](const BuildStrategy &self) { return self.enable_inplace_; },
......
...@@ -54,7 +54,7 @@ ELSE(WIN32) ...@@ -54,7 +54,7 @@ ELSE(WIN32)
DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
ENDIF() ENDIF()
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies})
add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
......
...@@ -25,4 +25,5 @@ import paddle.reader ...@@ -25,4 +25,5 @@ import paddle.reader
import paddle.dataset import paddle.dataset
import paddle.batch import paddle.batch
import paddle.compat import paddle.compat
import paddle.distributed
batch = batch.batch batch = batch.batch
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -37,7 +37,7 @@ default_envs = { ...@@ -37,7 +37,7 @@ default_envs = {
GPUS = 8 GPUS = 8
def start_procs(gpus, cmd, log_dir): def start_procs(gpus, entrypoint, entrypoint_args, log_dir):
procs = [] procs = []
log_fns = [] log_fns = []
os.system("mkdir -p %s" % log_dir) os.system("mkdir -p %s" % log_dir)
...@@ -73,12 +73,11 @@ def start_procs(gpus, cmd, log_dir): ...@@ -73,12 +73,11 @@ def start_procs(gpus, cmd, log_dir):
"PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
}) })
print("starting process ", i, cmd, curr_env) print("starting process ", i, entrypoint, entrypoint_args, curr_env)
fn = open("%s/workerlog.%d" % (log_dir, i), "w") fn = open("%s/workerlog.%d" % (log_dir, i), "w")
log_fns.append(fn) log_fns.append(fn)
procs.append( cmd = [sys.executable, "-u", entrypoint] + entrypoint_args
subprocess.Popen( procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env))
cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))
for i in range(gpus): for i in range(gpus):
try: try:
...@@ -89,7 +88,8 @@ def start_procs(gpus, cmd, log_dir): ...@@ -89,7 +88,8 @@ def start_procs(gpus, cmd, log_dir):
pass pass
def main(): def parse_args():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='''start paddle training using multi-process mode. description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode, NOTE: your train program ***must*** run as distributed nccl2 mode,
...@@ -108,21 +108,27 @@ POD_IP (current node ip address, not needed for local training) ...@@ -108,21 +108,27 @@ POD_IP (current node ip address, not needed for local training)
type=int, type=int,
default=8, default=8,
help='start number of processes for every gpu') help='start number of processes for every gpu')
parser.add_argument(
'--cmd',
type=str,
default="",
help='command to run for each process, e.g. python train.py --lr 0.1')
parser.add_argument( parser.add_argument(
'--log_dir', '--log_dir',
type=str, type=str,
default="mylog", default="mylog",
help='directory to put logs per process.') help='directory to put logs per process.')
args = parser.parse_args() parser.add_argument(
if args.cmd == "": 'entrypoint_script',
parser.print_help() type=str,
exit(0) help="The entrypoint script to be launched in parallel,"
start_procs(args.gpus, args.cmd, args.log_dir) "followed by all the arguments for each process,"
"e.g. train.py --lr 0.1")
parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
# launch multiple training process
start_procs(args.gpus, args.entrypoint_script, args.entrypoint_args,
args.log_dir)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -161,7 +161,6 @@ def __bootstrap__(): ...@@ -161,7 +161,6 @@ def __bootstrap__():
'times_excess_than_required_tmp_allocation', 'times_excess_than_required_tmp_allocation',
'enable_inplace_whitelist' 'enable_inplace_whitelist'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
......
...@@ -22,7 +22,7 @@ This API is still under active development and may change drastically. ...@@ -22,7 +22,7 @@ This API is still under active development and may change drastically.
from __future__ import print_function from __future__ import print_function
import contextlib from ...wrapped_decorator import signature_safe_contextmanager
import numpy as np import numpy as np
import six import six
...@@ -419,7 +419,7 @@ class TrainingDecoder(object): ...@@ -419,7 +419,7 @@ class TrainingDecoder(object):
self._state_cell = state_cell self._state_cell = state_cell
self._state_cell._enter_decoder(self) self._state_cell._enter_decoder(self)
@contextlib.contextmanager @signature_safe_contextmanager
def block(self): def block(self):
""" """
Define the behavior of the decoder for each RNN time step. Define the behavior of the decoder for each RNN time step.
...@@ -613,7 +613,7 @@ class BeamSearchDecoder(object): ...@@ -613,7 +613,7 @@ class BeamSearchDecoder(object):
self._word_dim = word_dim self._word_dim = word_dim
self._input_var_dict = input_var_dict self._input_var_dict = input_var_dict
@contextlib.contextmanager @signature_safe_contextmanager
def block(self): def block(self):
""" """
Define the behavior of the decoder for each RNN time step. Define the behavior of the decoder for each RNN time step.
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from __future__ import print_function from __future__ import print_function
import contextlib from ..wrapped_decorator import signature_safe_contextmanager
from .. import core from .. import core
...@@ -105,7 +105,7 @@ class Inferencer(object): ...@@ -105,7 +105,7 @@ class Inferencer(object):
return results return results
@contextlib.contextmanager @signature_safe_contextmanager
def _prog_and_scope_guard(self): def _prog_and_scope_guard(self):
with framework.program_guard(main_program=self.inference_program): with framework.program_guard(main_program=self.inference_program):
with executor.scope_guard(self.scope): with executor.scope_guard(self.scope):
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from __future__ import print_function from __future__ import print_function
import contextlib from ..wrapped_decorator import signature_safe_contextmanager
import os import os
import errno import errno
import shutil import shutil
...@@ -453,7 +453,7 @@ class Trainer(object): ...@@ -453,7 +453,7 @@ class Trainer(object):
io.save_inference_model(param_path, feeded_var_names, target_vars, io.save_inference_model(param_path, feeded_var_names, target_vars,
exe) exe)
@contextlib.contextmanager @signature_safe_contextmanager
def _prog_and_scope_guard(self): def _prog_and_scope_guard(self):
with framework.program_guard( with framework.program_guard(
main_program=self.train_program, main_program=self.train_program,
......
...@@ -17,7 +17,7 @@ from __future__ import print_function ...@@ -17,7 +17,7 @@ from __future__ import print_function
import os import os
import multiprocessing import multiprocessing
import numpy as np import numpy as np
import contextlib from .wrapped_decorator import signature_safe_contextmanager
import six import six
from .framework import Program, default_main_program, Variable from .framework import Program, default_main_program, Variable
from . import core from . import core
...@@ -49,7 +49,7 @@ def _switch_scope(scope): ...@@ -49,7 +49,7 @@ def _switch_scope(scope):
return ex return ex
@contextlib.contextmanager @signature_safe_contextmanager
def scope_guard(scope): def scope_guard(scope):
""" """
Change the global/default scope instance by Python `with` statement. All Change the global/default scope instance by Python `with` statement. All
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import collections import collections
from collections import defaultdict from collections import defaultdict
import contextlib from .wrapped_decorator import signature_safe_contextmanager
import os import os
import re import re
import traceback import traceback
...@@ -111,7 +111,7 @@ class NameScope(object): ...@@ -111,7 +111,7 @@ class NameScope(object):
_name_scope = NameScope() _name_scope = NameScope()
@contextlib.contextmanager @signature_safe_contextmanager
def name_scope(prefix=None): def name_scope(prefix=None):
""" """
Generate hierarchical name prefix for the operators. Generate hierarchical name prefix for the operators.
...@@ -1775,7 +1775,7 @@ class Program(object): ...@@ -1775,7 +1775,7 @@ class Program(object):
def set_op_role_var(self, var_name): def set_op_role_var(self, var_name):
self._op_role_var = [var_name] self._op_role_var = [var_name]
@contextlib.contextmanager @signature_safe_contextmanager
def _optimized_guard(self, param_and_grads): def _optimized_guard(self, param_and_grads):
""" """
A with guard to set :code:`Optimization` :code:`OpRole` and A with guard to set :code:`Optimization` :code:`OpRole` and
...@@ -1805,7 +1805,7 @@ class Program(object): ...@@ -1805,7 +1805,7 @@ class Program(object):
self._op_role_var = tmp_var self._op_role_var = tmp_var
self._current_role = tmp_role self._current_role = tmp_role
@contextlib.contextmanager @signature_safe_contextmanager
def _lr_schedule_guard(self, is_with_opt=False): def _lr_schedule_guard(self, is_with_opt=False):
""" """
A with guard to set :code:`LRSched` :code:`OpRole` and A with guard to set :code:`LRSched` :code:`OpRole` and
...@@ -2459,7 +2459,7 @@ def switch_startup_program(program): ...@@ -2459,7 +2459,7 @@ def switch_startup_program(program):
return prev_program return prev_program
@contextlib.contextmanager @signature_safe_contextmanager
def program_guard(main_program, startup_program=None): def program_guard(main_program, startup_program=None):
""" """
Change the global main program and startup program with `with` statement. Change the global main program and startup program with `with` statement.
...@@ -2524,7 +2524,7 @@ def _get_var(name, program=None): ...@@ -2524,7 +2524,7 @@ def _get_var(name, program=None):
return program.global_block().var(name) return program.global_block().var(name)
@contextlib.contextmanager @signature_safe_contextmanager
def _imperative_guard(tracer): def _imperative_guard(tracer):
global _imperative_tracer_ global _imperative_tracer_
tmp_trace = _imperative_tracer_ tmp_trace = _imperative_tracer_
...@@ -2535,7 +2535,7 @@ def _imperative_guard(tracer): ...@@ -2535,7 +2535,7 @@ def _imperative_guard(tracer):
_imperative_tracer_ = tmp_trace _imperative_tracer_ = tmp_trace
@contextlib.contextmanager @signature_safe_contextmanager
def _imperative_place_guard(place): def _imperative_place_guard(place):
global _imperative_current_expected_place_ global _imperative_current_expected_place_
tmp_place = _imperative_current_expected_place_ tmp_place = _imperative_current_expected_place_
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import contextlib from ..wrapped_decorator import signature_safe_contextmanager
import numpy as np import numpy as np
from paddle.fluid import core from paddle.fluid import core
...@@ -24,7 +24,7 @@ def enabled(): ...@@ -24,7 +24,7 @@ def enabled():
return framework._in_imperative_mode() return framework._in_imperative_mode()
@contextlib.contextmanager @signature_safe_contextmanager
def guard(place=None): def guard(place=None):
train = framework.Program() train = framework.Program()
startup = framework.Program() startup = framework.Program()
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
from . import framework from . import framework
import numpy as np import numpy as np
import contextlib from .wrapped_decorator import signature_safe_contextmanager
from .core import VarDesc from .core import VarDesc
from . import unique_name from . import unique_name
...@@ -49,7 +49,7 @@ def force_init_on_cpu(): ...@@ -49,7 +49,7 @@ def force_init_on_cpu():
return _force_init_on_cpu_ return _force_init_on_cpu_
@contextlib.contextmanager @signature_safe_contextmanager
def init_on_cpu(): def init_on_cpu():
""" """
Force the variable to be inited on CPU. Force the variable to be inited on CPU.
......
...@@ -302,7 +302,8 @@ class LayerHelper(object): ...@@ -302,7 +302,8 @@ class LayerHelper(object):
if default_initializer is None and attr.initializer is None: if default_initializer is None and attr.initializer is None:
if isinstance(dtype, core.VarDesc.VarType): if isinstance(dtype, core.VarDesc.VarType):
if dtype != core.VarDesc.VarType.FP32 and \ if dtype != core.VarDesc.VarType.FP32 and \
dtype != core.VarDesc.VarType.FP64: dtype != core.VarDesc.VarType.FP64 and \
dtype != core.VarDesc.VarType.FP16:
raise TypeError( raise TypeError(
"Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
) )
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import contextlib from ..wrapped_decorator import signature_safe_contextmanager
from .layer_function_generator import autodoc, templatedoc from .layer_function_generator import autodoc, templatedoc
from .tensor import assign, fill_constant from .tensor import assign, fill_constant
...@@ -1532,7 +1532,7 @@ class DynamicRNN(object): ...@@ -1532,7 +1532,7 @@ class DynamicRNN(object):
outputs={'Out': [x_reordered]}) outputs={'Out': [x_reordered]})
return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table) return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table)
@contextlib.contextmanager @signature_safe_contextmanager
def block(self): def block(self):
""" """
The block for user to define operators in RNN. See the class docstring The block for user to define operators in RNN. See the class docstring
......
...@@ -397,10 +397,10 @@ def box_coder(prior_box, ...@@ -397,10 +397,10 @@ def box_coder(prior_box,
input is image feature map, they are close to input is image feature map, they are close to
the origin of the coordinate system. [xmax, ymax] the origin of the coordinate system. [xmax, ymax]
is the right bottom coordinate of the anchor box. is the right bottom coordinate of the anchor box.
prior_box_var(Variable|list): prior_box_var supports two types of input. prior_box_var(Variable|list|None): prior_box_var supports two types
One is variable with shape [M, 4] holds M group. of input. One is variable with shape [M, 4]
The other one is list consist of 4 elements holds M group. The other one is list consist of
shared by all boxes. 4 elements shared by all boxes.
target_box(Variable): This input can be a 2-D LoDTensor with shape target_box(Variable): This input can be a 2-D LoDTensor with shape
[N, 4] when code_type is 'encode_center_size'. [N, 4] when code_type is 'encode_center_size'.
This input also can be a 3-D Tensor with shape This input also can be a 3-D Tensor with shape
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import contextlib from ..wrapped_decorator import signature_safe_contextmanager
import multiprocessing import multiprocessing
import os import os
import six import six
...@@ -1116,7 +1116,7 @@ class Preprocessor(object): ...@@ -1116,7 +1116,7 @@ class Preprocessor(object):
def _is_completed(self): def _is_completed(self):
return self.sub_block and self.source_var_names and self.sink_var_names return self.sub_block and self.source_var_names and self.sink_var_names
@contextlib.contextmanager @signature_safe_contextmanager
def block(self): def block(self):
self.status = Preprocessor.IN_SUB_BLOCK self.status = Preprocessor.IN_SUB_BLOCK
self.sub_block = self.main_prog._create_block() self.sub_block = self.main_prog._create_block()
......
...@@ -2930,6 +2930,7 @@ def batch_norm(input, ...@@ -2930,6 +2930,7 @@ def batch_norm(input,
"momentum": momentum, "momentum": momentum,
"epsilon": epsilon, "epsilon": epsilon,
"is_test": is_test, "is_test": is_test,
"data_layout": data_layout,
"use_mkldnn": False, "use_mkldnn": False,
"fuse_with_relu": fuse_with_relu, "fuse_with_relu": fuse_with_relu,
"use_global_stats": use_global_stats "use_global_stats": use_global_stats
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from __future__ import print_function from __future__ import print_function
from collections import defaultdict from collections import defaultdict
from contextlib import contextmanager from .wrapped_decorator import signature_safe_contextmanager
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
...@@ -1610,7 +1610,7 @@ class ModelAverage(Optimizer): ...@@ -1610,7 +1610,7 @@ class ModelAverage(Optimizer):
}, },
stop_gradient=True) stop_gradient=True)
@contextmanager @signature_safe_contextmanager
def apply(self, executor, need_restore=True): def apply(self, executor, need_restore=True):
"""Apply average values to parameters of current model. """Apply average values to parameters of current model.
""" """
......
...@@ -148,7 +148,8 @@ class ParallelExecutor(object): ...@@ -148,7 +148,8 @@ class ParallelExecutor(object):
else framework.default_main_program() else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
build_strategy.enable_inplace = False if main._is_mem_optimized else True if build_strategy.enable_inplace is None:
build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope() scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from, if share_vars_from and not isinstance(share_vars_from,
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from __future__ import print_function from __future__ import print_function
from . import core from . import core
from contextlib import contextmanager from .wrapped_decorator import signature_safe_contextmanager
import os import os
import six import six
...@@ -35,7 +35,7 @@ NVPROF_CONFIG = [ ...@@ -35,7 +35,7 @@ NVPROF_CONFIG = [
] ]
@contextmanager @signature_safe_contextmanager
def cuda_profiler(output_file, output_mode=None, config=None): def cuda_profiler(output_file, output_mode=None, config=None):
"""The CUDA profiler. """The CUDA profiler.
This fuctions is used to profile CUDA program by CUDA runtime application This fuctions is used to profile CUDA program by CUDA runtime application
...@@ -217,7 +217,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): ...@@ -217,7 +217,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
core.disable_profiler(key_map[sorted_key], profile_path) core.disable_profiler(key_map[sorted_key], profile_path)
@contextmanager @signature_safe_contextmanager
def profiler(state, sorted_key=None, profile_path='/tmp/profile'): def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
"""The profiler interface. """The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU Different from cuda_profiler, this profiler can be used to profile both CPU
......
...@@ -15,14 +15,14 @@ ...@@ -15,14 +15,14 @@
from __future__ import print_function from __future__ import print_function
import os import os
import contextlib from .wrapped_decorator import signature_safe_contextmanager
from . import core from . import core
__all__ = [ __all__ = [
'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files' 'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
] ]
@contextlib.contextmanager @signature_safe_contextmanager
def create_recordio_writer(filename, def create_recordio_writer(filename,
compressor=core.RecordIOWriter.Compressor.Snappy, compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000): max_num_records=1000):
......
...@@ -109,11 +109,12 @@ set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) ...@@ -109,11 +109,12 @@ set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
if(NOT APPLE) if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# change the timeout from 600 to 1200, because in debug mode, this test need more time.
set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200)
endif()
endif() endif()
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# change the timeout from 600 to 900, because in debug mode, this test need more time.
set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 900)
endif()
if (WITH_NGRAPH) if (WITH_NGRAPH)
add_subdirectory(ngraph) add_subdirectory(ngraph)
......
...@@ -16,14 +16,37 @@ from __future__ import print_function ...@@ -16,14 +16,37 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp
class TestNGRAPHAccuracyOp(TestAccuracyOp): class TestNGRAPHAccuracyOp(OpTest):
def setUp(self): def setUp(self):
super(TestNGRAPHAccuracyOp, self).setUp() self.op_type = "accuracy"
self.dtype = np.float32
self.init_dtype()
n = 128
infer = np.random.random((n, 1)).astype(self.dtype)
indices = np.random.randint(0, 2, (n, 1))
label = np.random.randint(0, 2, (n, 1))
self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
num_correct = 0
for rowid in range(n):
for ele in indices[rowid]:
if ele == label[rowid]:
num_correct += 1
break
self.outputs = {
'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
'Correct': np.array([num_correct]).astype("int64"),
'Total': np.array([n]).astype("int64")
}
self._cpu_only = True
def init_dtype(self):
pass
def test_check_output(self):
self.check_output()
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -18,17 +18,7 @@ import unittest ...@@ -18,17 +18,7 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh from paddle.fluid.tests.unittests.test_activation_op import TestSigmoid, TestRelu, TestTanh
class TestNGRAPHReluDim2(TestRelu):
def setUp(self):
super(TestNGRAPHReluDim2, self).setUp()
class TestNGRAPHTanhDim2(TestTanh):
def setUp(self):
super(TestNGRAPHTanhDim2, self).setUp()
class TestNGRAPHReluDim4(TestRelu): class TestNGRAPHReluDim4(TestRelu):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpTraining, TestBatchNormOpInference
class TestNGRAPHBatchNormOpTraining(TestBatchNormOpTraining):
def init_kernel_type(self):
super(TestNGRAPHBatchNormOpTraining, self).init_kernel_type()
class TestNGRAPHBatchNormOpInference(TestBatchNormOpInference):
def init_kernel_type(self):
super(TestNGRAPHBatchNormOpInference, self).init_kernel_type()
class TestNGRAPHBatchNormOpWithReluInference(TestBatchNormOpInference):
def init_kernel_type(self):
super(TestNGRAPHBatchNormOpWithReluInference, self).init_kernel_type()
if __name__ == '__main__':
unittest.main()
...@@ -15,35 +15,59 @@ ...@@ -15,35 +15,59 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from paddle.fluid.tests.unittests.test_conv2d_op import * from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
class TestNGRAPH(TestConv2dOp): class TestNGRAPH(TestConv2dOp):
def setUp(self):
super(TestNGRAPH, self).setUp()
self._cpu_only = True
def init_kernel_type(self): def init_kernel_type(self):
super(TestNGRAPH, self).init_kernel_type() super(TestNGRAPH, self).init_kernel_type()
class TestNGRAPHWithPad(TestWithPad): class TestNGRAPHWithPad(TestWithPad):
def setUp(self):
super(TestNGRAPHWithPad, self).setUp()
self._cpu_only = True
def init_kernel_type(self): def init_kernel_type(self):
super(TestNGRAPHWithPad, self).init_kernel_type() super(TestNGRAPHWithPad, self).init_kernel_type()
class TestNGRAPHWithStride(TestWithStride): class TestNGRAPHWithStride(TestWithStride):
def setUp(self):
super(TestNGRAPHWithStride, self).setUp()
self._cpu_only = True
def init_kernel_type(self): def init_kernel_type(self):
super(TestNGRAPHWithStride, self).init_kernel_type() super(TestNGRAPHWithStride, self).init_kernel_type()
class TestNGRAPHWithGroup(TestWithGroup): class TestNGRAPHWithGroup(TestWithGroup):
def setUp(self):
super(TestNGRAPHWithGroup, self).setUp()
self._cpu_only = True
def init_kernel_type(self): def init_kernel_type(self):
super(TestNGRAPHWithGroup, self).init_kernel_type() super(TestNGRAPHWithGroup, self).init_kernel_type()
class TestNGRAPHWith1x1(TestWith1x1): class TestNGRAPHWith1x1(TestWith1x1):
def setUp(self):
super(TestNGRAPHWith1x1, self).setUp()
self._cpu_only = True
def init_kernel_type(self): def init_kernel_type(self):
super(TestNGRAPHWith1x1, self).init_kernel_type() super(TestNGRAPHWith1x1, self).init_kernel_type()
class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def setUp(self):
super(TestNGRAPHWithInput1x1Filter1x1, self).setUp()
self._cpu_only = True
def init_kernel_type(self): def init_kernel_type(self):
super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type() super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type()
......
...@@ -14,73 +14,16 @@ ...@@ -14,73 +14,16 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from paddle.fluid.tests.unittests.test_elementwise_add_op import * from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp
class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp): class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp):
def init_input_output(self): def setUp(self):
super(TestNGRAPHElementwiseAddOp, self).init_input_output() super(TestNGRAPHElementwiseAddOp, self).setUp()
self._cpu_only = True
class TestNGRAPHElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_scalar, self).init_input_output()
class TestNGRAPHElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_scalar2, self).init_input_output()
class TestNGRAPHElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_Vector, self).init_input_output()
class TesNGRAPHtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
def init_input_output(self):
super(TesNGRAPHtElementwiseAddOp_broadcast_0, self).init_input_output()
class TestNGRAPHElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_broadcast_1, self).init_input_output()
class TestNGRAPHElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
def init_input_output(self): def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_broadcast_2, self).init_input_output() super(TestNGRAPHElementwiseAddOp, self).init_input_output()
class TestNGRAPHElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_broadcast_3, self).init_input_output()
class TestNGRAPHElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_broadcast_4, self).init_input_output()
class TestNGRAPHElementwiseAddOp_rowwise_add_0(
TestElementwiseAddOp_rowwise_add_0):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_rowwise_add_0,
self).init_input_output()
class TestNGRAPHElementwiseAddOp_rowwise_add_1(
TestElementwiseAddOp_rowwise_add_1):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_rowwise_add_1,
self).init_input_output()
class TestNGRAPHElementwiseAddOp_channelwise_add(
TestElementwiseAddOp_channelwise_add):
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp_channelwise_add,
self).init_input_output()
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -14,17 +14,13 @@ ...@@ -14,17 +14,13 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp
class TestNGRAPHMeanOp(TestMeanOp): class TestNGRAPHMeanOp(TestMeanOp):
def setUp(self): def setUp(self):
super(TestNGRAPHMeanOp, self).setUp() super(TestNGRAPHMeanOp, self).setUp()
self._cpu_only = True
class TestNGRAPHFP16MeanOp(TestFP16MeanOp):
def setUp(self):
super(TestNGRAPHFP16MeanOp, self).setUp()
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -15,27 +15,38 @@ ...@@ -15,27 +15,38 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2 import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
class TestNGRAPHMulOp(OpTest):
def setUp(self):
self.op_type = "mul"
self.dtype = np.float32
self.init_dtype_type()
self.inputs = {
'X': np.random.random((2, 4)).astype(self.dtype),
'Y': np.random.random((4, 4)).astype(self.dtype)
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
self._cpu_only = True
class TestNGRAPHMulOp(TestMulOp):
def init_dtype_type(self): def init_dtype_type(self):
pass pass
def test_check_output(self):
self.check_output()
class TestNGRAPHMulOp2(TestMulOp2): def test_check_grad_normal(self):
def init_dtype_type(self): self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
pass
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
class TestNGRAPHFP16MulOp1(TestFP16MulOp1): def test_check_grad_ingore_y(self):
def init_dtype_type(self): self.check_grad(
pass ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
class TestNGRAPHFP16MulOp2(TestFP16MulOp2):
def init_dtype_type(self):
pass
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -14,35 +14,59 @@ ...@@ -14,35 +14,59 @@
from __future__ import print_function from __future__ import print_function
from paddle.fluid.tests.unittests.test_pool2d_op import * from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
class TestNGRAPHPool2D_Op(TestPool2D_Op): class TestNGRAPHPool2D_Op(TestPool2D_Op):
def setUp(self):
super(TestNGRAPHPool2D_Op, self).setUp()
self._cpu_only = True
def init_test_case(self): def init_test_case(self):
super(TestNGRAPHPool2D_Op, self).init_test_case() super(TestNGRAPHPool2D_Op, self).init_test_case()
class TestNGRAPHCase1(TestCase1): class TestNGRAPHCase1(TestCase1):
def setUp(self):
super(TestNGRAPHCase1, self).setUp()
self._cpu_only = True
def init_test_case(self): def init_test_case(self):
super(TestNGRAPHCase1, self).init_test_case() super(TestNGRAPHCase1, self).init_test_case()
class TestNGRAPHCase2(TestCase2): class TestNGRAPHCase2(TestCase2):
def setUp(self):
super(TestNGRAPHCase2, self).setUp()
self._cpu_only = True
def init_test_case(self): def init_test_case(self):
super(TestNGRAPHCase2, self).init_test_case() super(TestNGRAPHCase2, self).init_test_case()
class TestNGRAPHCase3(TestCase3): class TestNGRAPHCase3(TestCase3):
def setUp(self):
super(TestNGRAPHCase3, self).setUp()
self._cpu_only = True
def init_pool_type(self): def init_pool_type(self):
super(TestNGRAPHCase3, self).init_pool_type() super(TestNGRAPHCase3, self).init_pool_type()
class TestNGRAPHCase4(TestCase4): class TestNGRAPHCase4(TestCase4):
def setUp(self):
super(TestNGRAPHCase4, self).setUp()
self._cpu_only = True
def init_pool_type(self): def init_pool_type(self):
super(TestNGRAPHCase4, self).init_pool_type() super(TestNGRAPHCase4, self).init_pool_type()
class TestNGRAPHCase5(TestCase5): class TestNGRAPHCase5(TestCase5):
def setUp(self):
super(TestNGRAPHCase5, self).setUp()
self._cpu_only = True
def init_pool_type(self): def init_pool_type(self):
super(TestNGRAPHCase5, self).init_pool_type() super(TestNGRAPHCase5, self).init_pool_type()
......
...@@ -13,25 +13,23 @@ ...@@ -13,25 +13,23 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows
class TestNGRAPHScaleOp(TestScaleOp): class TestNGRAPHScaleOp(TestScaleOp):
def init_dtype_type(self): def setUp(self):
pass super(TestNGRAPHScaleOp, self).setUp()
self._cpu_only = True
class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
def init_dtype_type(self): def init_dtype_type(self):
pass pass
class TestNGRAPHScaleFp16Op(TestScaleFp16Op): class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
def init_dtype_type(self): def setUp(self):
pass super(TestNGRAPHScaleOpSelectedRows, self).setUp()
self._cpu_only = True
class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows):
def init_dtype_type(self): def init_dtype_type(self):
pass pass
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_sum_op import TestSumOp, TestSelectedRowsSumOp, TestLoDTensorAndSelectedRowsOp
if __name__ == "__main__":
unittest.main()
...@@ -20,21 +20,25 @@ from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, ...@@ -20,21 +20,25 @@ from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d,
class TestNGRAPHTopkOp(TestTopkOp): class TestNGRAPHTopkOp(TestTopkOp):
def setUp(self): def setUp(self):
super(TestNGRAPHTopkOp, self).setUp() super(TestNGRAPHTopkOp, self).setUp()
self._cpu_only = True
class TestNGRAPHTopkOp2(TestTopkOp2): class TestNGRAPHTopkOp2(TestTopkOp2):
def setUp(self): def setUp(self):
super(TestNGRAPHTopkOp2, self).setUp() super(TestNGRAPHTopkOp2, self).setUp()
self._cpu_only = True
class TestNGRAPHTopkOp3(TestTopkOp3): class TestNGRAPHTopkOp3(TestTopkOp3):
def setUp(self): def setUp(self):
super(TestNGRAPHTopkOp3, self).setUp() super(TestNGRAPHTopkOp3, self).setUp()
self._cpu_only = True
class TestNGRAPHTopkOp4(TestTopkOp4): class TestNGRAPHTopkOp4(TestTopkOp4):
def setUp(self): def setUp(self):
super(TestNGRAPHTopkOp4, self).setUp() super(TestNGRAPHTopkOp4, self).setUp()
self._cpu_only = True
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -34,7 +34,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): ...@@ -34,7 +34,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
pb_y = pb_y.reshape(shape) pb_y = pb_y.reshape(shape)
if pb_v.ndim == 2: if pb_v.ndim == 2:
pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else (
pb_v.shape[0], 1, pb_v.shape[1])
pb_v = pb_v.reshape(var_shape)
if pb_v.ndim == 1: if pb_v.ndim == 1:
tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x
tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y
...@@ -125,33 +127,6 @@ class TestBoxCoderOp(OpTest): ...@@ -125,33 +127,6 @@ class TestBoxCoderOp(OpTest):
self.outputs = {'OutputBox': output_box} self.outputs = {'OutputBox': output_box}
class TestBoxCoderOpWithOneRankVar(OpTest):
def test_check_output(self):
self.check_output()
def setUp(self):
self.op_type = "box_coder"
lod = [[1, 1, 1, 1, 1]]
prior_box = np.random.random((81, 4)).astype('float32')
prior_box_var = np.random.random((4)).astype('float32')
target_box = np.random.random((20, 81, 4)).astype('float32')
code_type = "DecodeCenterSize"
box_normalized = False
output_box = batch_box_coder(prior_box, prior_box_var, target_box,
lod[0], code_type, box_normalized)
self.inputs = {
'PriorBox': prior_box,
'PriorBoxVar': prior_box_var,
'TargetBox': target_box,
}
self.attrs = {
'code_type': 'decode_center_size',
'box_normalized': False
}
self.outputs = {'OutputBox': output_box}
class TestBoxCoderOpWithoutBoxVar(OpTest): class TestBoxCoderOpWithoutBoxVar(OpTest):
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
...@@ -210,7 +185,7 @@ class TestBoxCoderOpWithAxis(OpTest): ...@@ -210,7 +185,7 @@ class TestBoxCoderOpWithAxis(OpTest):
self.op_type = "box_coder" self.op_type = "box_coder"
lod = [[1, 1, 1, 1, 1]] lod = [[1, 1, 1, 1, 1]]
prior_box = np.random.random((30, 4)).astype('float32') prior_box = np.random.random((30, 4)).astype('float32')
prior_box_var = np.random.random((4)).astype('float32') prior_box_var = np.random.random((30, 4)).astype('float32')
target_box = np.random.random((30, 81, 4)).astype('float32') target_box = np.random.random((30, 81, 4)).astype('float32')
code_type = "DecodeCenterSize" code_type = "DecodeCenterSize"
box_normalized = False box_normalized = False
......
...@@ -109,5 +109,32 @@ class TestExpandOpRank4(OpTest): ...@@ -109,5 +109,32 @@ class TestExpandOpRank4(OpTest):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out')
class TestExpandOpInteger(OpTest):
def setUp(self):
self.op_type = "expand"
self.inputs = {
'X': np.random.randint(
10, size=(2, 4, 5)).astype("int32")
}
self.attrs = {'expand_times': [2, 1, 4]}
output = np.tile(self.inputs['X'], (2, 1, 4))
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
class TestExpandOpBoolean(OpTest):
def setUp(self):
self.op_type = "expand"
self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
self.attrs = {'expand_times': [2, 1, 4]}
output = np.tile(self.inputs['X'], (2, 1, 4))
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from __future__ import print_function from __future__ import print_function
import collections import collections
import contextlib from .wrapped_decorator import signature_safe_contextmanager
import six import six
import sys import sys
...@@ -68,7 +68,7 @@ def switch(new_generator=None): ...@@ -68,7 +68,7 @@ def switch(new_generator=None):
return old return old
@contextlib.contextmanager @signature_safe_contextmanager
def guard(new_generator=None): def guard(new_generator=None):
if isinstance(new_generator, six.string_types): if isinstance(new_generator, six.string_types):
new_generator = UniqueNameGenerator(new_generator) new_generator = UniqueNameGenerator(new_generator)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import decorator
import contextlib
__all__ = ['wrap_decorator', 'signature_safe_contextmanager']
def wrap_decorator(decorator_func):
@decorator.decorator
def __impl__(func, *args, **kwargs):
wrapped_func = decorator_func(func)
return wrapped_func(*args, **kwargs)
return __impl__
signature_safe_contextmanager = wrap_decorator(contextlib.contextmanager)
...@@ -11,3 +11,4 @@ graphviz ...@@ -11,3 +11,4 @@ graphviz
six six
funcsigs funcsigs
pyyaml pyyaml
decorator
...@@ -100,6 +100,7 @@ packages=['paddle', ...@@ -100,6 +100,7 @@ packages=['paddle',
'paddle.utils', 'paddle.utils',
'paddle.dataset', 'paddle.dataset',
'paddle.reader', 'paddle.reader',
'paddle.distributed',
'paddle.fluid', 'paddle.fluid',
'paddle.fluid.imperative', 'paddle.fluid.imperative',
'paddle.fluid.proto', 'paddle.fluid.proto',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册