diff --git a/CMakeLists.txt b/CMakeLists.txt index e4442d254901e2524385452ebe5ac6f6df3056f9..61f5e63098c40f140774ba6bfd9a2de8d2d67bfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") if(WIN32) + set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_STATIC_LIBRARY_PREFIX lib) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") endif(WIN32) find_package(CUDA QUIET) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 076e839120d98d801de4374f2f8338ebd918b88f..b0f54bf49aafb65f1a92fa95877de2cc61fc67d3 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -152,7 +152,12 @@ endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - set(OPENMP_FLAGS "-fopenmp") + if(WIN32) + # openmp not support well for now on windows + set(OPENMP_FLAGS "") + else(WIN32) + set(OPENMP_FLAGS "-fopenmp") + endif(WIN32) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index ea46f6418edf1db70b2a308dd49cf2131cc89d3b..ef4192ecc98ea6de0c81c1f33320528d547b818a 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w") list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") if (NOT WIN32) -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - # nvcc 9 does not support -Os. Use Release flags instead - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -endif() + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) + elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + # nvcc 9 does not support -Os. Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + endif() else(NOT WIN32) -list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") - # match the cl's _ITERATOR_DEBUG_LEVEL - list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") -else() + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"") + list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -G") + # match the cl's _ITERATOR_DEBUG_LEVEL + list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") + else() message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") endif() endif(NOT WIN32) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 7a6a4523886824a67c82f9ce978de025ddb9c2cd..d3a4d69d3a05515fdf72074083470e19b4ec255c 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire IF(WIN32) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") ELSE(WIN32) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) @@ -39,7 +41,7 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 6a7be73f09a278ab0fd29c7599a7781df3d29413..92fe76d05c7507c295b784bc37870abfc31a0a29 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -49,6 +49,8 @@ IF(NOT WIN32) SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") +ELSE() + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc") ENDIF(NOT WIN32) ExternalProject_Add( @@ -61,7 +63,6 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 27d075336d556528ffaf1929c34753494692f0a0..1e01057aa606af78cd722d3619a710cb35817174 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) +if(WIN32) + SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") +else() + SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +endif() + ExternalProject_Add( extern_snappy GIT_REPOSITORY "https://github.com/google/snappy" @@ -31,7 +37,7 @@ ExternalProject_Add( -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 9e6c47f016fe6dfd809c5b2bc88ff59d0a6b2b84..36b533aa4f7815896fb48c33fefad892b8d0d29c 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag) if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") endif() - endif() + endif() endif() endfunction() @@ -147,12 +147,7 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=array-bounds # Warnings in Eigen::array ) - -else(NOT WIN32) -set(COMMON_FLAGS - "/w") #disable all warnings. -set(GPU_COMMON_FLAGS - "/w") #disable all warnings +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif(NOT WIN32) if (APPLE) @@ -193,8 +188,7 @@ safe_set_static_flag() CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/W3") - string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/W3") + string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}") + set(flag_var "${flag_var} /w") endforeach(flag_var) endif(WIN32) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index f50a38842a21c795c979f859e88a9b16c3e54bd8..df961be911537582fb60dd258fcdd4a5dd41a38e 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -8,13 +8,13 @@ paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.program_guard ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.name_scope ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.scope_guard ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) @@ -66,7 +66,7 @@ paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'unifo paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) @@ -229,7 +229,7 @@ paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)) paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.Preprocessor.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)) @@ -270,7 +270,7 @@ paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywo paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.DynamicRNN.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')) paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) @@ -346,12 +346,12 @@ paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'st paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)) -paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) @@ -456,7 +456,7 @@ paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', ' paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)) -paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) @@ -491,14 +491,14 @@ paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None) paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)) -paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.profiler.cuda_profiler ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.profiler.profiler ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None) paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None) paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.unique_name.guard ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 910318a49cea50fadd29b1427a4591abfa5d5a23..7ddf1ab44fe096739f4d241994e5cb686970a7c5 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -158,18 +158,19 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) -if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog - lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper) +if(WITH_NGRAPH) + set(NGRAPH_EXE_DEPS ngraph_engine) +else() + set(NGRAPH_EXE_DEPS) +endif() - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +if(WITH_DISTRIBUTE) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog + lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - if (WITH_NGRAPH) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine) - else () - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif() + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 1d9678a1ba1409e5c18d3e25b3aa13dfbbf76908..60708bf609d6f8b327d46fe585cbbcf07a62eece 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, auto& block = main_program.Block(0); for (auto var_name : fetch_var_names) { auto var_desc = block.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name); auto shapes = var_desc->GetShape(); PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1, "var %s: Fetched var has wrong shape, " diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 6621a59d37a670f7025507faeab5b9897794a72e..e88084424baf7eb5cd1d67ea19966866d71ec3eb 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,8 +54,6 @@ cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph grap cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -67,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass) if (WITH_GPU) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) endif() -cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph) -cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass) - +cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 51ce9732722efa44d2489f5b77694094e58c8775..f8030c53f72bc8a6f007c1eb6a3072abd8037de2 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -206,8 +206,6 @@ std::unique_ptr BuildStrategy::Apply( new std::vector(main_program.Block(0).AllOps()); graph->Set>(kAllOpDescs, all_op_descs); // take ownership - graph->Set(kGraphNodePool, - new GraphNodePool); // take ownership pass->Erase(kAllOpDescs); pass->SetNotOwned>(kAllOpDescs, all_op_descs); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index e3e06a5614ddee0bea342bc3608691b7a32326cc..e62e3edcef710df739c53b5d848f5aceb4f2db4e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -77,9 +77,6 @@ struct BuildStrategy { bool fuse_relu_depthwise_conv_{false}; bool memory_optimize_{false}; - - bool memory_early_delete_{false}; - // TODO(dzhwinter): // make enable_inplace, memory_optimize_ // memory_early_delete_ true by default diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 601ae4f8c6de11b0bf25d4f9a92ef8eada67be3d..1e3dbb1e44ecb16872e3bf4dee31e31cc69c9818 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -26,7 +26,7 @@ namespace paddle { namespace framework { namespace details { -struct ComputationOpHandle : public OpHandleBase { +class ComputationOpHandle : public OpHandleBase { public: ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, size_t scope_idx); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index be0d941c4f9c2fe8fbb1da8ec2c11868112fcf9b..6d53dac5c0a20b4340e71274a00a7f3c0cd08ff6 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ->Var(details::kLocalExecScopeName) ->GetMutable() = &local_scope; for (size_t j = 0; j < input_scope_idxes.size(); ++j) { - local_scope.Var("out_var" + j); - if (i == j) local_scope.Var("in_var" + j); + local_scope.Var("out_var" + std::to_string(j)); + if (i == j) local_scope.Var("in_var" + std::to_string(j)); } param_scopes_.emplace_back(&local_scope); } @@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { for (size_t i = 0; i < input_scope_idxes.size(); ++i) { // add input var handle - nodes_.emplace_back( - ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable)); - VarHandle* in_var_handle = - new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i], - "in_var" + i, place_list_[input_scope_idxes[i]]); + nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i), + ir::Node::Type::kVariable)); + VarHandle* in_var_handle = new VarHandle( + nodes_.back().get(), 1, input_scope_idxes[i], + "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); // add output var handle for (size_t j = 0; j < place_list_.size(); ++j) { - nodes_.emplace_back( - ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable)); - VarHandle* out_var_handle = new VarHandle( - nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]); + nodes_.emplace_back(ir::CreateNodeForTest( + "out_node" + std::to_string(i), ir::Node::Type::kVariable)); + VarHandle* out_var_handle = + new VarHandle(nodes_.back().get(), 2, j, + "out_var" + std::to_string(i), place_list_[j]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); } @@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { std::vector> send_vec; f::LoD lod{{0, 10, 20}}; for (size_t i = 0; i < input_scope_idxes.size(); ++i) { - const std::string varname("in_var" + i); + const std::string varname("in_var" + std::to_string(i)); float val_scalar = static_cast(i); send_vec.push_back( InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar)); @@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { WaitAll(); for (size_t i = 0; i < input_scope_idxes.size(); ++i) { - const std::string& varname("out_var" + i); + const std::string& varname("out_var" + std::to_string(i)); for (size_t j = 0; j < place_list_.size(); ++j) { LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]); } @@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; int height = static_cast(kDims[0] * 2); for (size_t i = 0; i < input_scope_idxes.size(); ++i) { - const std::string varname("in_var" + i); + const std::string varname("in_var" + std::to_string(i)); float val_scalar = static_cast(i); send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i], rows, height, val_scalar)); @@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { WaitAll(); for (size_t i = 0; i < input_scope_idxes.size(); ++i) { - const std::string& varname("out_var" + i); + const std::string& varname("out_var" + std::to_string(i)); for (size_t j = 0; j < place_list_.size(); ++j) { SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows, height); diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index 78c5d5b50e606daa963e728355dc1bce83cd5484..b0c5968499be3a959dd6103424f25056a6dc2282 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var, } } -const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, - const std::string& cache_var, - const size_t& idx, - ir::Graph* graph) const { +const NodeSwapQueue InplacePass::TryInplaceModifyVar( + const std::string& var, const std::string& cache_var, const size_t& idx, + ir::Graph* graph) const { PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && var_nodes_[var].at(0)->Var() != nullptr); std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); var_desc->SetName(cache_var); - SSANodePair swap_nodes; + NodeSwapQueue swap_nodes; for (size_t i = idx; i < view_.AllOps().size(); ++i) { auto* op = view_.AllOps()[i]; @@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, return swap_nodes; } -void InplacePass::CommitModify(const SSANodePair& swap_nodes, +void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes, ir::Graph* graph) const { for (auto& pair : swap_nodes) { auto *node = pair.first, *cache_node = pair.second; @@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes, } } -void InplacePass::WithdrawModify(const SSANodePair& nodes, +void InplacePass::WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const { for (auto& pair : nodes) { auto *node = pair.first, *cache_node = pair.second; diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h index 1abcf1f279e225839d440ff9c6840ce9b8a6547f..7be7f311852d2b64ce95e1a939371760d03d296b 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.h +++ b/paddle/fluid/framework/details/inplace_op_pass.h @@ -56,7 +56,8 @@ class GraphView { std::map> adj_list_; }; -typedef std::vector> SSANodePair; +// swap pairs in sequence +typedef std::vector> NodeSwapQueue; class InplacePass : public ir::Pass { public: InplacePass(); @@ -68,14 +69,14 @@ class InplacePass : public ir::Pass { void InitSSAGraphNodes() const; private: - const SSANodePair TryInplaceModifyVar(const std::string& var, - const std::string& cache_var, - const size_t& idx, - ir::Graph* graph) const; + const NodeSwapQueue TryInplaceModifyVar(const std::string& var, + const std::string& cache_var, + const size_t& idx, + ir::Graph* graph) const; - void CommitModify(const SSANodePair&, ir::Graph* graph) const; + void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const; - void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const; + void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const; void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, const size_t& idx) const; diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc deleted file mode 100644 index 69f8f705484450b0544291b19027eb174d7eeb8f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/memory_early_delete_pass.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/memory_early_delete_pass.h" -#include -#include -#include -#include "paddle/fluid/framework/details/memory_optimize_helper.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/reference_count_pass_helper.h" -#include "paddle/fluid/framework/ir/graph_helper.h" - -namespace paddle { -namespace framework { -namespace details { - -static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) { - std::queue queue; - queue.push(var_in); - do { - auto* var = queue.front(); - queue.pop(); - for (auto* op : var->PendingOps()) { - auto* compute_op = dynamic_cast(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) { - return compute_op; - } - for (auto* out_var : op->Outputs()) { - queue.push(out_var); - } - } - } while (!queue.empty()); - return nullptr; -} - -std::unique_ptr MemoryEarlyDeletePass::ApplyImpl( - std::unique_ptr graph) const { - auto& graph_pool = Get(kGraphNodePool); - auto& gcs = Get(kGarbageCollector); - - std::unordered_map> unlived_vars; - unlived_vars.reserve(graph_pool.size()); - for (auto& pair : graph_pool) { - unlived_vars.insert(std::make_pair(pair.first, pair.second)); - } - - auto compare_and_insert_early_delete_op = [&]( - OpHandleBase* op, const std::vector& vars) { - if (unlived_vars.empty()) return; - // unlived vars can be deleted after the last used op has finished. - auto* compute_op = dynamic_cast(op); - const auto& places = Get>(kAllPlaces); - for (auto& var : vars) { - auto* var_handle = dynamic_cast(var); - auto var_name = var->Node()->Name(); - auto& var_place = var_handle->place(); - if (unlived_vars.count(var_name) == 0) continue; - if (!unlived_vars[var_name].empty()) { - if (compute_op != nullptr && - unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) { - unlived_vars[var_name].erase(compute_op->Node()->Op()); - } - continue; - } - - if (var_handle == nullptr || !var_handle->Node()->IsVar() || - var_handle->Node()->IsCtrlVar()) - continue; - - // shameless copyed from reference count pass. - if (compute_op == nullptr) { - // use next computation op scope - compute_op = FindNextComputationOpHandle(var_handle); - } - auto* early_delete_node = - graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation); - GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get(); - auto* early_delete_handle = new EarlyDeleteOpHandle( - early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc); - if (compute_op->Outputs().empty()) { - auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - compute_op->AddOutput(dep_var); - graph->Get(kGraphDepVars).emplace(dep_var); - } - early_delete_handle->AddInput(compute_op->Outputs().front()); - VLOG(5) << "Add early delete op " << var_name << " to Operator" - << compute_op->Name(); - } - }; - - auto all_ops = ir::FilterByNodeWrapper(*graph); - for (auto& op : all_ops) { - compare_and_insert_early_delete_op(op, op->Inputs()); - compare_and_insert_early_delete_op(op, op->Outputs()); - } - return graph; -} - -} // namespace details -} // namespace framework -} // namespace paddle - -REGISTER_PASS(memory_early_delete_pass, - paddle::framework::details::MemoryEarlyDeletePass) - .RequireGraphAttr(paddle::framework::details::kGraphNodePool) - .RequireGraphAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h deleted file mode 100644 index 8215aa1b2baa223a111f9050d5488c5fc8ac0e6e..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/memory_early_delete_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/details/early_delete_op_handle.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace details { - -class MemoryEarlyDeletePass : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index b56ef021ef508a43aac082acbcfa6f543635203e..6345ba335997ec42ebc63f90e9bf6a3ed2648edc 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -13,17 +13,108 @@ // limitations under the License. #include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include #include #include #include #include #include +#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { namespace details { +using paddle::framework::VarDesc; -size_t NodeSizeInBytes(const VarDesc& node) { +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get>(kAllOpDescs); + + // 2. topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} + +size_t NodeSize(const VarDesc& node) { auto shape = node.GetShape(); int size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); @@ -31,9 +122,9 @@ size_t NodeSizeInBytes(const VarDesc& node) { return type_size * std::abs(size); } -size_t NodeSizeInBytes(ir::Node* n) { +size_t NodeSize(ir::Node* n) { auto* desc = FindVarDescInBlock(n); - return NodeSizeInBytes(*desc); + return NodeSize(*desc); } std::string DebugStringImpl(VarDesc* var) { @@ -59,7 +150,6 @@ std::string DebugStringImpl(VarDesc* var) { std::string DebugString(ir::Node* var) { return DebugStringImpl(FindVarDescInBlock(var)); } -// return DebugString(var->Var()); } // NOTE(dzh): based ir node, if a large node has been reused // by a small size node, then next time it appear in pool, it will @@ -80,18 +170,17 @@ struct NodeComparator { auto rhs_shape = rhs_desc->GetShape(); if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { - return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); + return NodeSize(lhs) <= NodeSize(rhs); } else { return false; } } }; -void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { +void OrderedSet::Insert(ir::Node* var) { PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); - PADDLE_ENFORCE(op->IsOp()); if (mark_table_.count(var->Name()) != 0) { - mark_table_[var->Name()]->second.insert(op); + mark_table_[var->Name()]->emplace_back(var); return; } @@ -99,14 +188,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { auto var_shape = var_desc->GetShape(); int batch_size = static_cast(var_shape[0]); - NodeComparator compare_node; + NodeComparator functor; Iter it = nodes_.begin(); while (it != nodes_.end()) { - auto* cache_desc = FindVarDescInBlock(it->first); + auto& prev = it->front(); + auto* cache_desc = FindVarDescInBlock(prev); int cache_batch_size = cache_desc->GetShape()[0]; if ((cache_batch_size == -1 && batch_size == -1) || (cache_batch_size != -1 && batch_size != -1)) { - if (compare_node(it->first, var)) { + if (functor(prev, var)) { ++it; } else { break; @@ -118,62 +208,80 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { } } - it = - nodes_.insert(it, std::make_pair(var, std::unordered_set{op})); + it = nodes_.insert(it, {var}); mark_table_[var->Name()] = it; } -int OrderedNodeList::GetIndex(ir::Node* var) { +int OrderedSet::GetNodeIndexInPool(ir::Node* var) { return std::distance(nodes_.begin(), mark_table_[var->Name()]); } -ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const { +ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { ir::Node* found_node = nullptr; - NodeComparator compare_node; + NodeComparator functor; for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - if (compare_node(var, it->first)) { - found_node = it->first; + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; break; } } return found_node; } -void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); } +bool OrderedSet::Has(ir::Node* var) const { + if (mark_table_.count(var->Name())) { + auto& node_in_samename = mark_table_.at(var->Name()); + auto iter = + std::find_if(node_in_samename->begin(), node_in_samename->end(), + [&](ir::Node* n) { return n->Name() == var->Name(); }); + return iter != node_in_samename->end(); + } + return false; +} -void OrderedNodeList::Erase(const std::string& var) { - PADDLE_ENFORCE(mark_table_.count(var)); - nodes_.erase(mark_table_[var]); - mark_table_.erase(var); +void OrderedSet::Erase(ir::Node* var) { + PADDLE_ENFORCE(mark_table_.count(var->Name())); + nodes_.erase(mark_table_[var->Name()]); + mark_table_.erase(var->Name()); } -std::string OrderedNodeList::ToString() const { +std::string OrderedSet::ToString() const { std::stringstream ss; for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - ss << DebugString(it->first) << " "; + for (auto& node : *it) { + ss << DebugString(node) << " "; + } } return ss.str(); } bool NodeCanReused(ir::Node* node) { + // valid the node is a var node if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; - // auto* desc = node->Var(); - bool flag = NodeCanReused(*node->Var()); + + bool flag = true; + // op output force generated in cpu, can not be reused. for (auto* op : node->inputs) { if (op->Op()->HasAttr("force_cpu")) { - // op output force generated in cpu, can not be reused. flag &= framework::AttrReader(op->Op()->GetAttrMap()) .Get("force_cpu") == 0; } } + // var desc validation. + flag &= NodeCanReused(*node->Var()); return flag; } bool NodeCanReused(const VarDesc& node) { auto type = node.GetType(); - if (node.Persistable() || type != proto::VarType::LOD_TENSOR || - node.GetShape().empty()) { + if (!(type == proto::VarType::LOD_TENSOR || + type == proto::VarType::SELECTED_ROWS || + type == proto::VarType::LOD_TENSOR_ARRAY)) { + return false; + } + if (node.Persistable() || node.GetShape().empty()) { return false; } // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad @@ -193,6 +301,174 @@ bool OpHasSubBlock(OpDesc* desc) { return false; } +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) { + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) + // compute the liveness of for each variable though reversed_ops algorithm. + // It iterates the operators from end to begin, compute the live in/live out + // variable set for each op, then the diff between in/out will be used for + // the variable reuse. For detail refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. + auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors liveness + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } +} + +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + } + } +} + +const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::vector ControlFlowGraph::Ops() const { return ops_; } + +std::vector& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, + ir::Node* op) const { + // in ssa-graph, different version nodes have same name, + // this function get the latest version var before target op + // It may return nullptr, such as data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + if (output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index 064183d61ea7386b6b45034c90fd7569a8647f60..0bfaf827fea84030de48a9984197f5b39f5c9261 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -27,41 +29,41 @@ namespace paddle { namespace framework { namespace details { -constexpr char kFetchedVars[] = "fetched_vars"; -constexpr char kGraphNodePool[] = "graph_node_pool"; +constexpr char kAllOpDescs[] = "all_op_descs"; -// NOTE(dzh): Variable and the operators use the var. -// for early delete pass. -// Because analysis var pass build base on ir::Node, which maybe released -// or modified between passes, so we use OpDesc* to mark ops. -using GraphNodePool = std::vector< - std::pair /* ops */>>; +std::vector SortOpLikeDescOrder(const ir::Graph& graph); -// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). -// in fluid, -1 means the batch_size is determined in runtime. -// the node batch_size equal -1 always ranking in the front than the node not. +// NOTE(dzh): A ordered set for node reuse in memory optimize. +// the orderedset sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size, which is determined in runtime. +// So the reuse happens between nodes who's batch_size both are -1 +// simultaneously or not. +// +// sort rule: +// rule 0 : smaller node ranking in front. +// rule 1 : batch_size equal -1 ranking in the front than the node not. +// // For example, // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. -// O(1) insert, delete -class OrderedNodeList { - public: - using NodePair = std::pair>; - using Iter = typename std::list::iterator; - using ConstIter = typename std::list::const_iterator; - void Insert(ir::Node* var, ir::Node* op); +class OrderedSet { + public: + // nodes with same name exists in pool. + using NodeVector = std::vector; + using Iter = typename std::list::iterator; + using ConstIter = typename std::list::const_iterator; + void Insert(ir::Node* var); void Erase(ir::Node* var); - - void Erase(const std::string& var); - - bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } - - bool Has(const std::string& var) { return mark_table_.count(var); } - - ir::Node* NodeMatch(ir::Node* var) const; + bool Has(ir::Node* var) const; + void Clear() { + mark_table_.clear(); + nodes_.clear(); + } + // find the bestfit shape node block with var. + ir::Node* FindBestFitNode(ir::Node* var) const; // map store non-const iterator, can not promise const - int GetIndex(ir::Node* var); + int GetNodeIndexInPool(ir::Node* var); // pool all node to string std::string ToString() const; @@ -69,18 +71,54 @@ class OrderedNodeList { Iter end() { return nodes_.end(); } ConstIter begin() const { return nodes_.begin(); } ConstIter end() const { return nodes_.end(); } - size_t size() const { return nodes_.size(); } - void Clear() { - mark_table_.clear(); - nodes_.clear(); - } + size_t size() const { return nodes_.size(); } private: // for searching. std::unordered_map mark_table_; - // node swap pairs. var -> ops dep var - std::list nodes_; + // node pool + std::list nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // IR Graph + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set LiveIn(ir::Node* op) const; + const std::set LiveOut(ir::Node* op) const; + const std::set Use(ir::Node* op) const; + const std::vector Ops() const; + std::vector& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + + using NodeListMap = std::unordered_map>; + using VarSetMap = std::map>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector ops_; // op sequence by topology sort }; // valid a tensor can be reuse or not @@ -93,15 +131,24 @@ bool NodeCanReused(const VarDesc& node); bool OpHasSubBlock(OpDesc* desc); // node memory size in bytes -size_t NodeSizeInBytes(ir::Node* n); +size_t NodeSize(ir::Node* n); // node memory size in bytes -size_t NodeSizeInBytes(const VarDesc&); +size_t NodeSize(const VarDesc&); std::string DebugString(ir::Node* var); +// NOTE(dzhwinter) +// after node reuse, the replaced node shape is +// different with its VarDesc. So need to find the +// correct VarDesc in Block. VarDesc* FindVarDescInBlock(ir::Node* n); +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + template class FilterVariableImpl { public: diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc index f2b9baf14a34ace9cc860797280dbd519dfa4f2a..5c13dda9e5491044d2bcbed4b24d438cc8b8a413 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include #include +#include #include #include #include @@ -22,13 +23,19 @@ #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/details/graph_test_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" namespace paddle { namespace framework { namespace details { -TEST(OrderedNodeList, Normal) { - OrderedNodeList pool; +TEST(OrderedSet, Normal) { + OrderedSet pool; std::vector> nodes; // clang-format off @@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) { nodes.emplace_back(std::move(node)); } + // Insert for (auto& node : nodes) { - pool.Insert(node.get(), op.get()); + pool.Insert(node.get()); + } + + // Has/size + ASSERT_EQ(pool.size(), shapes.size()); + for (auto& node : nodes) { + ASSERT_TRUE(pool.Has(node.get())); } // assert its order and interface. @@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) { std::cout << pool.ToString() << std::endl; ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); - ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); + ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0); { auto v1 = block_desc->Var("11"); v1->SetShape({-1, 256, 56, 56}); std::unique_ptr node1 = ir::CreateNodeForTest(v1); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); + auto* cache = pool.FindBestFitNode(node1.get()); ASSERT_EQ(cache, nullptr); } { @@ -81,16 +95,401 @@ TEST(OrderedNodeList, Normal) { v2->SetShape({-1, 2, 5}); std::unique_ptr node1 = ir::CreateNodeForTest(v2); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5] } { auto v3 = block_desc->Var("13"); v3->SetShape({2, 5}); std::unique_ptr node1 = ir::CreateNodeForTest(v3); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] + } +} +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace details { + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto op_descs = prog.Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // remove sum node + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. +TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + auto op_descs = prog.Block(0).AllOps(); + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); } } diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index 85de14a60a8fe6958794f0ac25768b9da1943f9d..41e4a834df0abab069ab1f6cd21c8479b911250c 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -43,11 +43,6 @@ namespace paddle { namespace framework { namespace details { -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - std::unique_ptr MemoryOptimizePass::ApplyImpl( std::unique_ptr graph) const { auto nodes = graph->Nodes(); @@ -77,7 +72,7 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || skip_set_.count(var->Name())) continue; - ir::Node* cache = pool_.NodeMatch(var); + ir::Node* cache = pool_.FindBestFitNode(var); if (var->Name() == FLAGS_memory_optimize_debug) { VLOG(3) << "start match var " << DebugString(var) << " of op " @@ -95,11 +90,12 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( << "replace it again. Skip this candidate."; continue; - int node_idx_in_pool = pool_.GetIndex(cache); + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); VLOG(3) << string::Sprintf( "!!! %s, %s => %s, cache idx %d, pool size %d", std::to_string(reuse_id++), DebugString(var), DebugString(cache), node_idx_in_pool, static_cast(pool_.size())); + // update CFG Graph on the fly. // reused var maybe re-fill into the pool cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); @@ -112,6 +108,7 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( pool_.Erase(cache); } + // fill the pool std::unordered_set unlived_vars; for (auto var : cfg_->LiveIn(op)) { @@ -120,36 +117,15 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( } } for (auto var : unlived_vars) { - ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); + ir::Node* var_node = cfg_->GetNodeByName(var, op); if (NodeCanReused(var_node) && !pool_.Has(var_node)) { - pool_.Insert(var_node, op); + pool_.Insert(var_node); } } } } graph->ResolveHazard(var_nodes_); - // For early delete pass. use GraphNodePool load the unlived vars. - // 1. find all deps op for each unlived var in memory pool. - for (auto& op : graph->Nodes()) { - for (auto& var : op->inputs) { - if (pool_.Has(var)) { - pool_.Insert(var, op); - } - } - } - // 2. convert ir node based memory pool to graph node - // because Node* maybe released bettwen passes. - auto& graph_pool = graph->Get(kGraphNodePool); - for (auto it = pool_.begin(); it != pool_.end(); ++it) { - std::unordered_set descs; - for (auto& op : it->second) { - PADDLE_ENFORCE(op->IsOp()); - descs.insert(op->Op()); - } - graph_pool.push_back(std::make_pair(it->first->Name(), descs)); - } - return graph; } @@ -198,12 +174,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { PADDLE_ENFORCE(sub_op != nullptr); for (auto* var : sub_op->outputs) { if (NodeCanReused(var)) { - ir::Node* cache = pool_.NodeMatch(var); + ir::Node* cache = pool_.FindBestFitNode(var); if (cache != nullptr) { if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { continue; } - int node_idx_in_pool = pool_.GetIndex(cache); + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); VLOG(3) << string::Sprintf( "!!! %s, %s => %s, cache idx %d, pool size %d", std::to_string(sub_reuse_id++), DebugString(var), @@ -342,267 +318,10 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, var_nodes_.at(var).clear(); } -std::vector SortOpLikeDescOrder(const ir::Graph& graph) { - PADDLE_ENFORCE(graph.Has(kAllOpDescs), - "Graph has no attribute of kAllOpDescs."); - // 1. get op desc order - auto& op_descs = graph.Get>(kAllOpDescs); - - // 2. topology sort order - auto nodes = graph.Nodes(); - std::deque ops; - FilterVariables(nodes, [&](ir::Node* op) { - if (op->IsOp() && op->Op() != nullptr) { - ops.emplace_back(op); - } - }); - std::unordered_map op_deps; - std::list ready_ops; - std::unordered_map> pending_ops; - - for (auto* op : ops) { - std::unordered_set preceding_op; - for (auto* in : op->inputs) { - if (in->inputs.empty()) continue; - PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); - preceding_op.emplace(in->inputs[0]); - pending_ops[in->inputs[0]].emplace(op); - } - op_deps[op] = preceding_op.size(); - if (preceding_op.empty()) { - ready_ops.emplace_back(op); - } - } - - // 3. generated op list based desc order and the topology order - std::vector ret; - std::list op_descs_list(op_descs.begin(), op_descs.end()); - - auto update_by_found_node = [&](ir::Node* found_node) { - for (auto* pending_op : pending_ops[found_node]) { - if (--op_deps[pending_op] == 0) { - ready_ops.emplace_back(pending_op); - } - } - ready_ops.remove(found_node); - ret.emplace_back(found_node); - }; - - while (!ready_ops.empty()) { - bool all_of_ready_op_unmatched = true; - for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { - auto op_desc = *it; - ir::Node* found_node = nullptr; - for (auto* op : ready_ops) { - if (IsSameDesc(op->Op(), op_desc)) { - found_node = op; - break; - } - } - - // 3.1 op desc deleted by other pass - if (found_node == nullptr) { - ++it; - continue; - } else { - all_of_ready_op_unmatched = false; - it = op_descs_list.erase(it); - } - update_by_found_node(found_node); - } - - // 3.2 op descs are added by other pass - // preceding op non empty means some new op descs are - // created, but not contained in return node list. - // these new op desc may depend on each other. - std::list prev_ready_ops(ready_ops); - if (all_of_ready_op_unmatched) { - for (auto op : prev_ready_ops) { - update_by_found_node(op); - } - } - } - - PADDLE_ENFORCE(std::all_of( - op_deps.begin(), op_deps.end(), - [&](const std::pair& p) { return p.second == 0; })); - - return ret; -} - -ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { - ops_ = SortOpLikeDescOrder(graph); - ConnectNodes(); -} - -void ControlFlowGraph::BuildCFGGraph() { - // FIXME(dzh): same effect with ConnectNodes, but use the control - // link to build dependency graph, it goes wrong in transformer. - for (ir::Node* op : ops_) { - for (auto& input_var : op->inputs) { - if (!input_var->inputs.empty()) { - PADDLE_ENFORCE( - input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), - "Preceding Op Node of Var Node must be unique"); - auto* pred_op = input_var->inputs[0]; - if (pred_op->Op() != nullptr) { - predecessors_[op].insert(pred_op); - successors_[pred_op].insert(op); - } - } - if (input_var->IsVar() && !input_var->IsCtrlVar()) { - uses_[op].insert(input_var->Name()); - } - } - for (auto& output_var : op->outputs) { - // output var may be used by many op - for (auto* succ_op : output_var->outputs) { - if (succ_op->Op() != nullptr) { - successors_[op].insert(succ_op); - predecessors_[succ_op].insert(op); - } - } - if (output_var->IsVar() && !output_var->IsCtrlVar()) { - defs_[op].insert(output_var->Name()); - } - } - } -} - -void ControlFlowGraph::ConnectNodes() { - for (size_t i = 0; i < ops_.size(); ++i) { - auto& op = ops_[i]; - try { - auto& next_op = ops_.at(i + 1); - successors_[op].insert(next_op); - predecessors_[next_op].insert(op); - } catch (...) { - // do nothing - } - - FilterVariables(op->inputs, - [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); - - FilterVariables(op->outputs, - [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); - } -} - -void ControlFlowGraph::LiveVariableAnalysis() { - // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) - // compute the liveness of for each variable though reversed_ops algorithm. - // It iterates the operators from end to begin, compute the live in/live out - // variable set for each op, then the diff between in/out will be used for - // the variable reuse. For detail refer to - // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf - std::list work_list(ops_.rbegin(), ops_.rend()); - while (!work_list.empty()) { - ir::Node* op = work_list.front(); - work_list.pop_front(); - // get the live_in calculated before. Empty if first. - auto prev_live_in = std::move(live_in_[op]); - for (auto& s : successors_[op]) { - for (auto& var : live_in_[s]) { - live_out_[op].insert(var); - } - } - for (auto& var : uses_[op]) { - live_in_[op].insert(var); - } - for (auto& var : live_out_[op]) { - live_in_[op].insert(var); - } - for (auto& var : defs_[op]) { - live_in_[op].erase(var); - } - - // If the live_in is not changed, then the liveness analysis of - // predecessors is completed. - // - // Otherwise, recalculate the predecessors liveness - if (live_in_[op] != prev_live_in) { - for (auto& pre : predecessors_[op]) { - work_list.push_back(pre); - } - } - } -} - -void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, - int begin_idx) { - // update graph from begin idx to the end - for (size_t i = begin_idx; i != ops_.size(); ++i) { - auto* op = ops_[i]; - if (uses_[op].find(old_node) != uses_[op].end()) { - uses_[op].erase(old_node); - uses_[op].insert(new_node); - } - if (defs_[op].find(old_node) != defs_[op].end()) { - defs_[op].erase(old_node); - defs_[op].insert(new_node); - } - if (live_in_[op].find(old_node) != live_in_[op].end()) { - live_in_[op].erase(old_node); - live_in_[op].insert(new_node); - } - if (live_out_[op].find(old_node) != live_out_[op].end()) { - live_out_[op].erase(old_node); - live_out_[op].insert(new_node); - } - } -} - -const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { - auto it = live_in_.find(op); - PADDLE_ENFORCE( - it != live_in_.end(), - string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); - return it->second; -} - -const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { - auto it = live_out_.find(op); - PADDLE_ENFORCE( - it != live_out_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::set ControlFlowGraph::Use(ir::Node* op) const { - auto it = uses_.find(op); - PADDLE_ENFORCE( - it != uses_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::vector ControlFlowGraph::Ops() const { return ops_; } - -std::vector& ControlFlowGraph::Ops() { return ops_; } - -ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, - ir::Node* op) const { - // in ssa-graph, different version nodes have same name, - // this function get the latest version var before target op - // It may return nullptr, such as data node. - ir::Node* found_node = nullptr; - for (auto* node : ops_) { - if (node == op) break; - for (auto& output : node->outputs) { - if (output->Name() == name) { - found_node = output; - } - } - } - return found_node; -} - } // namespace details } // namespace framework } // namespace paddle REGISTER_PASS(memory_optimize_pass, paddle::framework::details::MemoryOptimizePass) - .RequireGraphAttr(paddle::framework::details::kGraphNodePool) .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h index 3d6b1897f3b5106054b8f647f9cf613ebd1d65ff..593ffc10fc99d26b1ee9174ceef081581126e7e8 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.h +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -32,20 +32,15 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - -std::vector SortOpLikeDescOrder(const ir::Graph& graph); - -class ControlFlowGraph; class MemoryOptimizePass : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - - private: // fill the variable map(var_nodes) by version. void InitSSAGraphNodes() const; + + private: // update program descs void RenameVarInGraphDesc(const std::string& var, const std::string& cache_var, size_t idx) const; @@ -62,7 +57,7 @@ class MemoryOptimizePass : public ir::Pass { private: // Reuse Node Pool, Owned. - mutable OrderedNodeList pool_; + mutable OrderedSet pool_; // controlflow Graph mutable std::unique_ptr cfg_; // skip set @@ -71,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass { mutable std::map> var_nodes_; }; -class ControlFlowGraph { - public: - ControlFlowGraph() = default; - // For IR Graph in parallelexecutor - explicit ControlFlowGraph(const ir::Graph& graph); - - void LiveVariableAnalysis(); - - void RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, int begin_idx); - - const std::set LiveIn(ir::Node* op) const; - const std::set LiveOut(ir::Node* op) const; - const std::set Use(ir::Node* op) const; - const std::vector Ops() const; - std::vector& Ops(); - - // for ssa-graph nodes - ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; - - private: - void BuildCFGGraph(); - void ConnectNodes(); - using NodeListMap = std::unordered_map>; - using VarSetMap = std::map>; - // successors ops use the output variables. - NodeListMap successors_; - // predecessors ops generated input variables. - NodeListMap predecessors_; - // variables lived before run current op. - VarSetMap live_in_; - // variables lived after run current op. - VarSetMap live_out_; - VarSetMap uses_; // op inputs - VarSetMap defs_; // op outputs - - std::vector ops_; // op sequence by topology sort -}; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_pass_test.cc b/paddle/fluid/framework/details/memory_optimize_pass_test.cc deleted file mode 100644 index 3d3dfa93594d496431f7cb60dceb26f20250fc16..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/memory_optimize_pass_test.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/memory_optimize_pass.h" -#include -#include -#include -#include "glog/logging.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/details/graph_test_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" - -REGISTER_OPERATOR(sum, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(assign, paddle::framework::DummyOp, - paddle::framework::AssignOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -/* - https://en.wikipedia.org/wiki/Live_variable_analysis - Create a customed classical dependency graph, left row is the instruction - number. - 1. a = 1 - 2. b = a - 3. c = a - 4. d = b + c - 5. e = d - - a--------+ - | | - b c - | | - d--------+ - | - e - Then analysis these variable's liveness range - */ - -namespace paddle { -namespace framework { -namespace details { - -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - -inline static ProgramDesc FillProgramDesc() { - ProgramDesc prog; - prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"b"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"c"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"d"}); - op->SetOutput("Out", {"e"}); - } - return prog; -} - -TEST(CFGGraph, IRGraph) { - // prepare ir graph - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - ControlFlowGraph cfg(graph); - cfg.LiveVariableAnalysis(); - - // test assign op - ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); - - // test assign op - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); - - // test sum op - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); - ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); - - // test assign op - ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); - ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); -} - -// 1. normal test -TEST(SortOpLikeDescOrder, NormalTest) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto nodes = SortOpLikeDescOrder(graph); - auto op_descs = prog.Block(0).AllOps(); - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 2. remove some op_desc -TEST(SortOpLikeDescOrder, RemoveOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - auto nodes = graph.Nodes(); - auto op_descs = prog.Block(0).AllOps(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->outputs.back()->Name() == "e") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - graph.RemoveNode(found_node); - graph.RemoveNode(e); - - // other node keeps the same order - auto remain_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < remain_nodes.size(); ++i) { - auto node = remain_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 3. add some op_desc -TEST(SortOpLikeDescOrder, AddOpDesc) { - auto prog = FillProgramDesc(); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - ir::Graph graph(prog); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // cached desc different with real one - // mimic the intermidiete pass modify the programdesc. - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto op_descs = prog.Block(0).AllOps(); - - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - op_descs.insert(op_descs.begin() + 4, op); - - auto nodes = SortOpLikeDescOrder(graph); - - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 4. add and delete some op_desc -TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // remove sum node - auto op_descs = prog.Block(0).AllOps(); - ir::Node* found_node = nullptr; - auto nodes = graph.Nodes(); - for (auto node : nodes) { - if (node->Name() == "sum") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* c = find_node_in_graph("c"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(c->outputs.begin(), c->outputs.end(), found_node); - ir::Node* pending_op = found_node->outputs[0]->outputs[0]; - graph.RemoveNode(e); - graph.RemoveNode(pending_op); - graph.RemoveNode(found_node); - } - - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - op_descs.insert(op_descs.begin() + 2, op); - - // check the order - auto mynodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < mynodes.size(); ++i) { - auto node = mynodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 5. add and replace some op_desc inplace. -TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - auto op_descs = prog.Block(0).AllOps(); - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - - op_descs.emplace_back(op); - - // replace op_desc inplace - auto nodes = graph.Nodes(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->Op() && node->Name() == "assign") { - if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { - found_node = node; - break; - } - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(e->inputs.begin(), e->inputs.end(), found_node); - graph.RemoveNode(found_node); - } - op_descs.erase(op_descs.begin() + 3); - - auto replace_op = prog.MutableBlock(0)->AppendOp(); - replace_op->SetType("sum"); - replace_op->SetInput("X", {"d", "d1"}); - replace_op->SetOutput("Out", {"e"}); - { - ir::Node* sum2 = graph.CreateOpNode(replace_op); - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - ir::Node* d1 = find_node_in_graph("d1"); - sum2->inputs.emplace_back(d); - sum2->inputs.emplace_back(d1); - sum2->outputs.emplace_back(e); - e->inputs.emplace_back(sum2); - d->outputs.emplace_back(sum2); - d1->outputs.emplace_back(sum2); - } - - op_descs.emplace_back(replace_op); - // compare op order - auto graph_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < graph_nodes.size(); ++i) { - auto node = graph_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 128aaa33a2c60e62fdca13768cdc0a815167f3ef..e8deb5bfc6c013addce11e2a04286a828acc1930 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -65,7 +65,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - fetch_data.emplace_back(std::move(call())); + fetch_data.emplace_back(call()); } } @@ -74,7 +74,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (exception_holder_.IsCaught()) { f.wait(); } else { - fetch_data.emplace_back(std::move(f.get())); + fetch_data.emplace_back(f.get()); } } } diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index cc2c8bfef9f9f54c2e499467df0d22ce3f69d6b8..879fb29d5926941e574d0080051c195293bc60a9 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -17,6 +17,7 @@ #include #include #include +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h index a04c08bc2eb3bae797d648b30a22a5fee7ba0eaa..ea3034877fcea80de0124df64d8d23028bdcb7b3 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.h +++ b/paddle/fluid/framework/details/sequential_execution_pass.h @@ -21,8 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - class SequentialExecutionPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 6338be75a4b1d3c4caf7a6f7add4d05fec690340..96530b2a3f9cfd9462627a42b2bb0fea98758f92 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -44,6 +44,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, // Since we want to fetch LodTensor from a variable, the variable must // be created alreadly. Variable* g_fetch_value = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name); PADDLE_ENFORCE(g_fetch_value->IsType(), "Only %s can be invoked by GetFetchVariable", typeid(FeedFetchList).name()); diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index 03ab2a2b6c5dc07805fddddc3ac53f61e7b6a697..a3ccf677c90e8466f6c89041979336d45c1ac942 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference { bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const { return in.Name() != out.Name() && details::NodeCanReused(in) && details::NodeCanReused(out) && - details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in); + details::NodeSize(out) <= details::NodeSize(in); } }; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 3eb5bdba3b7275f45cdfc6ad47f75e7a423541d0..4b5c846f3271b2dd5e094020571069aff590cd2b 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -76,7 +76,7 @@ std::map> Graph::InitFromProgram( var->inputs.push_back(node); } } - return std::move(var_nodes); + return var_nodes; } void Graph::ResolveHazard( diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index b7f7c3d82e0da4d3ca8795487fa52fba0394e365..feb3330176490e71c51cce826fe49d5499469ad7 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -142,7 +142,7 @@ class Graph { // TODO(panyx0718): control var name should be really unique. const std::string name = string::Sprintf( "%s@%llu", static_cast(ir::Node::kControlDepVarName), - node_set_.size()); + num_node_created_); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); x->SetId(num_node_created_++); return x; diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc index 7713ed1eab88ee4fa16d52e7425075ae66f721a3..6607c026a748576f38419b275d71217f3eee0c59 100644 --- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase { std::unordered_set invalid_nodes; int valid_op = 0; for (auto* node : graph->Nodes()) { + PADDLE_ENFORCE_NOT_NULL(node); if (is_valid_node(node)) { invalid_nodes.insert(node); } else if (node->IsOp()) { diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc index 456a03192cc4e4a9d0dbe2dcb649b6c1b4d9cd5a..35d1d5129bba7043026e5489b806480775473257 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -164,7 +164,7 @@ ProgramDesc BuildProgramDesc(int num_inputs_of_concat) { }; std::vector concat_inputs; for (int i = 0; i < num_inputs_of_concat; ++i) { - std::string prefix = "seqpool_op_" + i; + std::string prefix = "seqpool_op_" + std::to_string(i); new_var(prefix + "in"); new_var(prefix + "out"); new_var(prefix + "out_unused"); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9d6c10ab9e33d0e9888fa484030be9da7752512e..e15c838f4fbe44fa4f0b543021e97b6b6c70e757 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -188,14 +188,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << place << " " << DebugStringEx(&scope); } catch (platform::EnforceNotMet exception) { if (Attrs().count("sub_block") != 0) { - throw exception; + throw; } auto& callstack = Attr>( OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); if (callstack.empty()) { - throw exception; + throw; } std::ostringstream sout; sout << "Invoke operator " << Type() << " error.\n"; @@ -206,7 +206,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { sout << "C++ Callstacks: \n"; sout << exception.err_str_; exception.err_str_ = sout.str(); - throw exception; + throw; } catch (...) { std::rethrow_exception(std::current_exception()); } @@ -589,7 +589,7 @@ class RuntimeInferShapeContext : public InferShapeContext { public: RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, const RuntimeContext& ctx) - : op_(op), scope_(scope), ctx_(ctx) {} + : op_(op), ctx_(ctx) {} bool HasInput(const std::string& name) const override { // has only one input @@ -881,7 +881,6 @@ class RuntimeInferShapeContext : public InferShapeContext { } const OperatorBase& op_; - const Scope& scope_; const RuntimeContext& ctx_; }; @@ -990,11 +989,14 @@ void OperatorWithKernel::TransferInplaceVarsBack( const Scope& transfer_scope) const { for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; + auto* origin_var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.", + var_name); auto* original_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name)); + GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); auto* var = transfer_scope.FindVar(var_name); - PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr", - var_name); + PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.", + var_name); auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); original_tensor->ShareDataWith(*transformed_tensor); } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 40d935a5ff98a28b376a59b733e2929e4a128cb9..e33214b44bb5d8ea5eb32d442d597a369c198bdd 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -222,12 +222,7 @@ class ExecutionContext { if (it == ctx_.inputs.end()) { return {}; } - std::vector res; - res.reserve(it->second.size()); - std::transform(it->second.begin(), it->second.end(), - std::back_inserter(res), - [this](Variable* var) { return var; }); - return res; + return {it->second.begin(), it->second.end()}; } std::vector MultiOutputVar(const std::string& name) const { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f61c9e3a91146704faa6c5b1058137bef67d2a3e..ff7ef0cce2f12fe89dd087c4a5006b2cfdc5a4a9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -171,14 +171,6 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; - - if (build_strategy_.memory_early_delete_) { - auto early_delete_pass = - ir::PassRegistry::Instance().Get("memory_early_delete_pass"); - early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = early_delete_pass->Apply(std::move(graph)); - } - VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); + VLOG(10) << "Eager Deletion Threshold " + << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { for (size_t i = 0; i < graphs.size(); ++i) { graphs[i] = member_->PrepareGCAndRefCnts( @@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle -USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 953618560913229cd1e47659ad61e621efc10ed1..87f0f307d30bc90a43a698c3766b16c975f0635e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -22,11 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" -DEFINE_bool(benchmark, false, - "Doing memory benchmark. It will make deleting scope synchronized, " - "and add some memory usage logs." - "Default cuda is asynchronous device, set to True will" - "force op run in synchronous mode."); +DECLARE_bool(benchmark); DEFINE_bool( eager_delete_scope, true, diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 5db422119966948f75970874e13d416ea699158a..ec8dedd605235a2d197e6a313bd589d5b9520cdf 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) -cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 11484a647303b32a6006bef3cfe4be6b3f0d533d..157862016e3556902f6507e02417624363ed1029 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -58,12 +58,13 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) - target_link_libraries(paddle_fluid_shared shlwapi) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) endif() +get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) +target_link_libraries(paddle_fluid_shared ${os_dependency_modules}) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 7476c199cfd073ec0962fa9a48f24750a6484bb5..8d5ee36ae627deccd7ddbd4bf8c5354a82c5e9db 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -101,7 +101,7 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { } graph = pass->Apply(std::move(graph)); } - return std::move(graph); + return graph; } framework::proto::ProgramDesc IRPassManager::AcquireProgram( diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index eb6e1768a2c01f1388962eefe8e70368cae8cf8b..410a90132aa7657a23b858570763547fe53730a0 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,4 +1,7 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +if(WITH_TESTING) + add_dependencies(subgraph_detector gtest) +endif() if (WITH_GPU AND TENSORRT_FOUND) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index ad0af4005ad154d2f5c67d00dec9d7ec397eb662..85755fc471ae3d37ec5d005882668ccf0c35b354 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -52,8 +52,8 @@ cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index da2e9803f0467f2b83d79cdd06d4317d41630b04..712e010db4340bde55945f89c488ba8cc38a1926 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -421,7 +421,7 @@ std::unique_ptr CreatePaddlePredictor< if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } - return std::move(predictor); + return predictor; } void AnalysisPredictor::PrepareFeedFetch() { diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 8ac8bc529183edc2f8f888ca7ba14611acaadc10..f90a74b9102ee62d15c2d738b53971c9bde51439 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -16,6 +16,12 @@ /*! \file paddle_api.h */ +/*! \mainpage Paddle Inference APIs + * \section intro_sec Introduction + * The Paddle inference library aims to offer an high performance inference SDK + * for Paddle users. + */ + #include #include #include @@ -34,26 +40,49 @@ enum PaddleDType { }; /** - *\brief Memory menager for PaddleTensor. + * \brief Memory manager for `PaddleTensor`. * - *The PaddleBuf holds a buffer for data input or output. The memory can be - *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf - *should be reused for better performance. + * The PaddleBuf holds a buffer for data input or output. The memory can be + * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf + * should be reused for better performance. * - *For user allocated memory, the following API can be used: - *- PaddleBuf(void* data, size_t length) to set an external memory by - *specifying - * the memory address and length. - *- Reset(void* data, size_t length) to reset the PaddleBuf with an external + * For user allocated memory, the following API can be used: + * - PaddleBuf(void* data, size_t length) to set an external memory by + * specifying the memory address and length. + * - Reset(void* data, size_t length) to reset the PaddleBuf with an external *memory. - *ATTENTION, for user allocated memory, deallocation should be done by users + * ATTENTION, for user allocated memory, deallocation should be done by users *externally after the program finished. The PaddleBuf won't do any allocation *or deallocation. * - *To have the PaddleBuf allocate and manage the memory: - *- PaddleBuf(size_t length) will allocate a memory of size `length`. - *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION + * To have the PaddleBuf allocate and manage the memory: + * - PaddleBuf(size_t length) will allocate a memory of size `length`. + * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION * if the allocated memory is larger than `length`, nothing will done. + * + * Usage: + * + * Let PaddleBuf manage the memory internally. + * \code{cpp} + * const int num_elements = 128; + * PaddleBuf buf(num_elements * sizeof(float)); + * \endcode + * + * Or + * \code{cpp} + * PaddleBuf buf; + * buf.Resize(num_elements * sizeof(float)); + * \endcode + * Works the exactly the same. + * + * One can also make the `PaddleBuf` use the external memory. + * \code{cpp} + * PaddleBuf buf; + * void* external_memory = new float[num_elements]; + * buf.Reset(external_memory, num_elements*sizeof(float)); + * ... + * delete[] external_memory; // manage the memory lifetime outside. + * \endcode */ class PaddleBuf { public: @@ -78,7 +107,7 @@ class PaddleBuf { /** Tell whether the buffer is empty. */ bool empty() const { return length_ == 0; } - /** Get the memory address. + /** Get the data's memory address. */ void* data() const { return data_; } /** Get the memory length. @@ -110,7 +139,8 @@ struct PaddleTensor { }; enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; -/** Tensor without copy, currently only supports AnalysisPredictor. + +/** Tensor without copy, currently only supports `AnalysisPredictor`. */ class ZeroCopyTensor { public: @@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config { * * Usage: * + * \code{.cpp} * NativeConfig config; * ... // change the configs. * auto native_predictor = CreatePaddlePredictor(config); + * \endcode * * FOR EXTENSION DEVELOPER: * Different predictors are designated by config type. Similar configs can be diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 039389a4cf99da6c2576c148d8c294e5d79aa7a8..f9c13c2fa84b3b5d629297d3f44a6f5889a734f4 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } +GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { + passes_.assign({ + "infer_clean_graph_pass", // + "identity_scale_op_clean_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif + }); + + for (int i = 6; i >= 3; i--) { + passes_.push_back("transpose_flatten" + std::to_string(i) + + "_concat_fuse_pass"); + } + use_gpu_ = true; +} + void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { analysis_passes_.push_back(pass); } +CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { + // NOTE the large fusions should be located in the front, so that they will + // not be damaged by smaller ones. + passes_.assign({ + "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "seqpool_concat_fuse_pass", // + "seqconv_eltadd_relu_fuse_pass", // + // "embedding_fc_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // + "repeated_fc_relu_fuse_pass", // + "squared_mat_sub_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "is_test_pass", // + "identity_scale_op_clean_pass", // + }); + use_gpu_ = false; +} } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index aa353f12ca7333713e2d640cce6b2dfbea3c4e26..2524d89fcd1322e105ad2217347aa2380448f2bc 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder { */ class CpuPassStrategy : public PassStrategy { public: - CpuPassStrategy() : PassStrategy({}) { - // NOTE the large fusions should be located in the front, so that they will - // not be damaged by smaller ones. - passes_.assign({ - "infer_clean_graph_pass", // - "attention_lstm_fuse_pass", // - "seqpool_concat_fuse_pass", // - "seqconv_eltadd_relu_fuse_pass", // - // "embedding_fc_lstm_fuse_pass", // - "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "fc_fuse_pass", // - "repeated_fc_relu_fuse_pass", // - "squared_mat_sub_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "is_test_pass", // - "identity_scale_op_clean_pass", // - }); - use_gpu_ = false; - } + CpuPassStrategy(); explicit CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.AllPasses()) {} @@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy { */ class GpuPassStrategy : public PassStrategy { public: - GpuPassStrategy() : PassStrategy({}) { - passes_.assign({ - "infer_clean_graph_pass", // - "identity_scale_op_clean_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // -#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be - // guaranteed at least v7 - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // -#endif - }); - - for (int i = 6; i >= 3; i--) { - passes_.push_back("transpose_flatten" + std::to_string(i) + - "_concat_fuse_pass"); - } - use_gpu_ = true; - } + GpuPassStrategy(); explicit GpuPassStrategy(const GpuPassStrategy &other) : PassStrategy(other.AllPasses()) { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 794d729bdc1adc7eb3fe44ffabfe0cc99719b421..ea0b729dc6f62f517877e060cb0ecbe5c1d22e61 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -83,7 +83,7 @@ class ChunkedAllocator : public Allocator { VLOG(1) << "Create AutoIncrementAllocator with chunk_size " << max_chunk_size_ << " and capacity " << capacity; default_allocator_ = std::make_shared( - [this] { return std::move(CreateAllocatorWithChunk()); }, capacity); + [this] { return CreateAllocatorWithChunk(); }, capacity); } } diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 6f3e512fb0b68df5e86eba3e50a255c18f75214f..e3d6c2f511ef083ef9ecc1fe8df96051b2b85cc2 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -111,6 +111,8 @@ size_t BestFitAllocator::NumFreeChunks() const { } void BestFitAllocator::Free(Allocation* allocation) { auto* bf_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(bf_allocation, + "The input allocation is not BestFitAllocation."); auto chunk_it = bf_allocation->ChunkIterator(); PADDLE_ENFORCE(!chunk_it->is_free); chunk_it->is_free = true; diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 327adcc4aac1c50b51942c557d66dae6770e24f2..e983ae327d69389526ae4cae226b0eb324759700 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -36,6 +36,7 @@ DEFINE_bool(init_allocated_mem, false, "that initializing the allocated memory with a small value " "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_bool(benchmark); namespace paddle { namespace memory { @@ -198,7 +199,7 @@ void *Alloc(const platform::CUDAPlace &place, << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); } else { - if (VLOG_IS_ON(3)) { + if (FLAGS_benchmark) { allocation::GPUMemMonitor.Add(place.device, size); } if (FLAGS_init_allocated_mem) { @@ -216,7 +217,7 @@ void Free(const platform::CUDAPlace &place, void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); - if (VLOG_IS_ON(3)) { + if (FLAGS_benchmark) { allocation::GPUMemMonitor.Minus(place.device, size); } #else @@ -257,7 +258,7 @@ void *Alloc(const platform::CUDAPinnedPlace &place, void *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - LOG(WARNING) << "cudaMallocHost Cannot allocate " << size + LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size << " bytes in CUDAPinnedPlace"; } if (FLAGS_init_allocated_mem) { diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 6ac3aefdd18d6d9a21dc7ce66511013dfb78bc5b..de81d12cca6ca280289371abdec225c9e2b8f4d0 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, // "CPUPinnedAllocator should be used for Cross-Device Communication"); void *ptr; - PADDLE_ENFORCE(cudaMallocHost(&ptr, size)); + PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); return new CPUPinnedAllocation(ptr, size); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 26d12dd91c7fda31802226a84d883b6a6e9abbe4..42d0938f2afbb1efca8bfdd7035bc0eada30f06b 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -19,7 +19,7 @@ namespace paddle { namespace memory { namespace allocation { -// Allocator uses `cudaMallocHost` +// Allocator uses `cudaHostAlloc` class CPUPinnedAllocation : public Allocation { public: CPUPinnedAllocation(void *ptr, size_t size) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 3e8fb83e9d5ba2078bcf37e4a4af74708df9c11c..197d1c2f21fd818879aafe17599bc87d33caa198 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void* p; // PINNED memory is visible to all CUDA contexts. - cudaError_t result = cudaMallocHost(&p, size); + cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); if (result == cudaSuccess) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; return p; } else { - LOG(WARNING) << "cudaMallocHost failed."; + LOG(WARNING) << "cudaHostAlloc failed."; return nullptr; } diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 189db2317d0544014d9c74e0fd5e9ead54925b9c..65efe2966ce12e86ba7f4944eb57ae72cdf9796f 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -37,7 +37,7 @@ using paddle::framework::Tensor; "(bool, default false) Set to true for inference only, false " \ "for training. Some layers may run faster when this is true.") \ .SetDefault(false); \ - AddComment(#OP_COMMENT); \ + AddComment(OP_COMMENT); \ } \ } @@ -124,7 +124,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel { UNUSED constexpr char SigmoidDoc[] = R"DOC( Sigmoid Activation Operator -$$out = \frac{1}{1 + e^{-x}}$$ +$$out = \\frac{1}{1 + e^{-x}}$$ )DOC"; @@ -187,14 +187,14 @@ $out = |x|$ UNUSED constexpr char CeilDoc[] = R"DOC( Ceil Activation Operator. -$out = ceil(x)$ +$out = \left \lceil x \right \rceil$ )DOC"; UNUSED constexpr char FloorDoc[] = R"DOC( Floor Activation Operator. -$out = floor(x)$ +$out = \left \lfloor x \right \rfloor$ )DOC"; @@ -252,7 +252,7 @@ $out = \ln(1 + e^{x})$ UNUSED constexpr char SoftsignDoc[] = R"DOC( Softsign Activation Operator. -$$out = \frac{x}{1 + |x|}$$ +$$out = \\frac{x}{1 + \|x\|}$$ )DOC"; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index bd788f03e7d666aad7ce6f0c63cea30f029e3491..fd9f156d070bdb1990a2fc9c63305933050e5524 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -222,7 +222,7 @@ void Conv2DOpMaker::Make() { .SetDefault(4096); AddAttr("exhaustive_search", "(bool, default false) cuDNN has many algorithm to calculation " - "convolution, whether enable exhaustive search ", + "convolution, whether enable exhaustive search " "for cuDNN convolution or not, defalut is False.") .SetDefault(false); AddComment(R"DOC( @@ -341,7 +341,7 @@ void Conv3DOpMaker::Make() { .SetDefault(4096); AddAttr("exhaustive_search", "(bool, default false) cuDNN has many algorithm to calculation " - "convolution, whether enable exhaustive search ", + "convolution, whether enable exhaustive search " "for cuDNN convolution or not, defalut is False.") .SetDefault(false); AddComment(R"DOC( diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index fdcff62e1fe59b3a2f4925bdff98632f71220abb..0a51d50e06176e713922837861f2102c9ee8a899 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel { "The shape of PriorBox is [N, 4]"); if (ctx->HasInput("PriorBoxVar")) { auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); - PADDLE_ENFORCE( - prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2, - "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2."); - if (prior_box_var_dims.size() == 1) { - PADDLE_ENFORCE_EQ( - prior_box_var_dims[0], 4, - "The 1st dimension of Input(PriorBoxVar) should be 4" - "when the rank is 1."); - } else { - PADDLE_ENFORCE_EQ( - prior_box_dims, prior_box_var_dims, - "The dimension of Input(PriorBoxVar) should be equal to" - "the dimension of Input(PriorBox when the rank is 2.)"); - } + PADDLE_ENFORCE(prior_box_var_dims.size() == 2, + "Input(PriorBoxVar) of BoxCoderOp should be 2."); + PADDLE_ENFORCE_EQ( + prior_box_dims, prior_box_var_dims, + "The dimension of Input(PriorBoxVar) should be equal to" + "the dimension of Input(PriorBox) when the rank is 2."); } } diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index e078af3eb478a8bebc6a7fc6460d169d803a3c4b..19a5bb90fa828899ad6270c051090dd3662aeed8 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel( output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); if (prior_box_var_data) { - int prior_var_offset = 0; - if (prior_box_var_size == 2) { - prior_var_offset = col_idx * len; - } + int prior_var_offset = col_idx * len; output[idx * len] /= prior_box_var_data[prior_var_offset]; output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1]; output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2]; @@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel( T box_var_x = T(1), box_var_y = T(1); T box_var_w = T(1), box_var_h = T(1); if (prior_box_var_data) { - int prior_var_offset = 0; - if (prior_box_var_size == 2) { - prior_var_offset = axis == 0 ? col_idx * len : row_idx * len; - } + int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len; box_var_x = prior_box_var_data[prior_var_offset]; box_var_y = prior_box_var_data[prior_var_offset + 1]; box_var_w = prior_box_var_data[prior_var_offset + 2]; diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index a0b1faf7bdc7001eba2d92b4d03fbaf9feb7bcbb..6d406f8196f9964c85bb94541fa7a7a23857539b 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel { output[offset + 3] = std::log(std::fabs(target_box_height / prior_box_height)); if (prior_box_var) { - int prior_var_offset = 0; - if (prior_box_var->dims().size() == 2) { - prior_var_offset = j * len; - } + int prior_var_offset = j * len; output[offset] /= prior_box_var_data[prior_var_offset]; output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; @@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel { } } } + template void DecodeCenterSize(const framework::Tensor* target_box, const framework::Tensor* prior_box, const framework::Tensor* prior_box_var, - const bool normalized, const int axis, - const std::vector variance, T* output) const { + const bool normalized, std::vector variance, + T* output) const { int64_t row = target_box->dims()[0]; int64_t col = target_box->dims()[1]; int64_t len = target_box->dims()[2]; @@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel { auto* target_box_data = target_box->data(); auto* prior_box_data = prior_box->data(); const T* prior_box_var_data = nullptr; - if (prior_box_var) prior_box_var_data = prior_box_var->data(); + if (var_size == 2) prior_box_var_data = prior_box_var->data(); int prior_box_offset = 0; + T var_data[4] = {1., 1., 1., 1.}; + T* var_ptr = var_data; #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; - if (axis == 0) { - prior_box_offset = j * len; - } else if (axis == 1) { - prior_box_offset = i * len; - } + prior_box_offset = axis == 0 ? j * len : i * len; T prior_box_width = prior_box_data[prior_box_offset + 2] - prior_box_data[prior_box_offset] + (normalized == false); @@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel { T target_box_center_x = 0, target_box_center_y = 0; T target_box_width = 0, target_box_height = 0; - T box_var_x = T(1), box_var_y = T(1); - T box_var_w = T(1), box_var_h = T(1); - if (prior_box_var) { - int prior_var_offset = 0; - if (prior_box_var->dims().size() == 2) { - if (axis == 0) - prior_var_offset = j * len; - else if (axis == 1) - prior_var_offset = i * len; - } - box_var_x = prior_box_var_data[prior_var_offset]; - box_var_y = prior_box_var_data[prior_var_offset + 1]; - box_var_w = prior_box_var_data[prior_var_offset + 2]; - box_var_h = prior_box_var_data[prior_var_offset + 3]; - } else if (!(variance.empty())) { - box_var_x = static_cast(variance[0]); - box_var_y = static_cast(variance[1]); - box_var_w = static_cast(variance[2]); - box_var_h = static_cast(variance[3]); + int prior_var_offset = axis == 0 ? j * len : i * len; + if (var_size == 2) { + std::memcpy(var_ptr, prior_box_var_data + prior_var_offset, + 4 * sizeof(T)); + } else if (var_size == 1) { + var_ptr = reinterpret_cast(variance.data()); } + T box_var_x = *var_ptr; + T box_var_y = *(var_ptr + 1); + T box_var_w = *(var_ptr + 2); + T box_var_h = *(var_ptr + 3); + target_box_center_x = box_var_x * target_box_data[offset] * prior_box_width + prior_box_center_x; @@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel { EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, variance, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { - DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis, - variance, output); + if (prior_box_var) { + if (axis == 0) { + DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } else if (!(variance.empty())) { + if (axis == 0) { + DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } else { + if (axis == 0) { + DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index d04bb8f338a80946e8f1d945f66122f02f526eac..91e44152658d87750f0b6d5826c481904085e086 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { } }; +class ElementwiseGradOpInplace : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map ret; + if (block->HasVar(framework::GradVarName("X")) && + block->HasVar(framework::GradVarName("Out"))) { + ret[framework::GradVarName("Out")] = framework::GradVarName("X"); + } + return ret; + } +}; + } // namespace operators } // namespace paddle @@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { op_type##GradMaker, \ ::paddle::operators::ElementwiseOpInplace); \ REGISTER_OPERATOR(op_type##_grad, \ - ::paddle::operators::ElementwiseOpExplicitGrad) + ::paddle::operators::ElementwiseOpExplicitGrad, \ + ::paddle::operators::ElementwiseGradOpInplace) diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 6aa4c76b9ce7f52f5816ea136e04b32a7d2e8d44..44a2f37b66772425a835c26e94c37b500e8a5d19 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -146,7 +146,11 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp); REGISTER_OP_CPU_KERNEL( - expand, ops::ExpandKernel); + expand, ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); REGISTER_OP_CPU_KERNEL( expand_grad, - ops::ExpandGradKernel); + ops::ExpandGradKernel, + ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu index d95c9b61802b5fe7059e1c95a50776db5aa7ad93..50a506b294db14f0d170c60a0ed760dcf280ad60 100644 --- a/paddle/fluid/operators/expand_op.cu +++ b/paddle/fluid/operators/expand_op.cu @@ -15,7 +15,11 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - expand, ops::ExpandKernel); + expand, ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); REGISTER_OP_CUDA_KERNEL( expand_grad, - ops::ExpandGradKernel); + ops::ExpandGradKernel, + ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 8aff9111412030265491289bbdb03cf688d59ad8..d51eb054a96d27f6ce87ba4b4e717f49dcd8a588 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -21,26 +21,17 @@ limitations under the License. */ namespace paddle { namespace operators { -template -using EigenVectorArrayMap = - Eigen::TensorMap>; - -template -using ConstEigenVectorArrayMap = - Eigen::TensorMap>; +template +struct Compare { + public: + bool operator()(const T a, const T b) { return (std::abs(a) < std::abs(b)); } +}; template struct FindAbsMaxFunctor { void operator()(const platform::CPUDeviceContext& ctx, const T* in, const int num, T* out) { - Eigen::DSizes idim(num); - Eigen::DSizes odim(1); - Eigen::TensorMap> in_e(in, idim); - Eigen::TensorMap> out_e(out, odim); - - out_e = in_e.abs().maximum(); + *out = *(std::max_element(in + 0, in + num, Compare())); } }; diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h index 68e66f9298c4eafabb55c20195d46fed800f4ec4..13d98577e21db9041686822f57cb4992e5ad71ec 100644 --- a/paddle/fluid/operators/jit/gen/act.h +++ b/paddle/fluid/operators/jit/gen/act.h @@ -63,7 +63,6 @@ class VActFunc : public JitCode { public: explicit VActFunc(size_t code_size, void* code_ptr) : JitCode(code_size, code_ptr) {} - virtual const char* name() const = 0; virtual void genCode() = 0; protected: @@ -269,7 +268,7 @@ class VActJitCode : public VActFunc { this->genCode(); } - const char* name() const override { + std::string name() const override { std::string base = "VActJitCode"; switch (type_) { case operand_type::RELU: @@ -293,7 +292,7 @@ class VActJitCode : public VActFunc { default: break; } - return base.c_str(); + return base; } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index 66a97c1be503b0fa983f9a7ec3b61c986774f16b..70312bbe5e97fcf465ce13ef71e5acc9bab4874e 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -41,7 +41,7 @@ class VXXJitCode : public JitCode { this->genCode(); } - virtual const char* name() const { + std::string name() const override { std::string base = "VXXJitCode"; if (scalar_index_ == 1) { base += "_Scalar"; @@ -62,7 +62,7 @@ class VXXJitCode : public JitCode { } base += (with_relu_ ? "_Relu" : ""); base += "_D" + std::to_string(num_); - return base.c_str(); + return base; } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h index a4d7222a3459d175fc5eaf5cdf0e7a1a610f8b0c..d91f828e6aa7673265a460524dfcad119758aa77 100644 --- a/paddle/fluid/operators/jit/gen/gru.h +++ b/paddle/fluid/operators/jit/gen/gru.h @@ -49,7 +49,7 @@ class GRUJitCode : public VActFunc { this->genCode(); } - const char* name() const override { + std::string name() const override { std::string base = "GRUJitCode"; if (id_ == 0) { base += "_H1"; @@ -81,7 +81,7 @@ class GRUJitCode : public VActFunc { }; AddTypeStr(act_gate_); AddTypeStr(act_cand_); - return base.c_str(); + return base; } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h index d3bc94b63d3f962cd655367a2afe1a08582b06fa..28d213e5e48749f84405454a2708d9289b9d290c 100644 --- a/paddle/fluid/operators/jit/gen/hopv.h +++ b/paddle/fluid/operators/jit/gen/hopv.h @@ -35,14 +35,14 @@ class HOPVJitCode : public JitCode { this->genCode(); } - virtual const char* name() const { + std::string name() const override { std::string base = "VXXJitCode"; if (type_ == operand_type::MAX) { base += "_MAX"; } else { base += "_SUM"; } - return base.c_str(); + return base; } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index c388109604bc57e8475e79a6c57eecb5bfebfb52..689df8b1cbb7a928c9f9175d28a8231b56e2e82e 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/platform/cpu_info.h" @@ -59,7 +60,7 @@ typedef enum { } operand_type; #define DECLARE_JIT_CODE(codename) \ - const char* name() const override { return #codename; } + std::string name() const override { return #codename; } class JitCode : public GenBase, public Xbyak::CodeGenerator { public: @@ -68,7 +69,6 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { (code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size), code_ptr) {} - virtual const char* name() const = 0; virtual void genCode() = 0; size_t getSize() const override { return CodeGenerator::getSize(); } diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h index d4753bca23de91c74415d41c372cde1610712ef7..fa560b6230d7164be907f0172fb1d91860c05db2 100644 --- a/paddle/fluid/operators/jit/gen/lstm.h +++ b/paddle/fluid/operators/jit/gen/lstm.h @@ -53,7 +53,7 @@ class LSTMJitCode : public VActFunc { this->genCode(); } - const char* name() const override { + std::string name() const override { std::string base = "LSTMJitCode"; if (use_peephole_) { base += "_Peephole"; @@ -85,7 +85,7 @@ class LSTMJitCode : public VActFunc { AddTypeStr(act_gate_); AddTypeStr(act_cand_); AddTypeStr(act_cell_); - return base.c_str(); + return base; } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h index 626baa8f738bf0395f3c7f1700610d0a9075879b..881cea581acc27a7aa7d395c041d40a4d3281947 100644 --- a/paddle/fluid/operators/jit/gen/matmul.h +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -36,11 +36,11 @@ class MatMulJitCode : public JitCode { this->genCode(); } - virtual const char* name() const { + std::string name() const override { std::string base = "MatMulJitCode"; base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + std::to_string(k_); - return base.c_str(); + return base; } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index fcbbb3c84c562e2ba57110134bf07bb218b41edb..4108ee2f46433f6dc846cbdd3a8f8f9b15cc0c67 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -38,7 +38,7 @@ class SeqPoolJitCode : public JitCode { this->genCode(); } - virtual const char* name() const { + std::string name() const override { std::string base = "SeqPoolJitCode"; if (type_ == SeqPoolType::kSum) { base += "_Sum"; @@ -48,7 +48,7 @@ class SeqPoolJitCode : public JitCode { base += "_Sqrt"; } base += ("_W" + std::to_string(w_)); - return base.c_str(); + return base; } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 3cd5f6554bdc188ce9ea0c0b85c84d032c509600..f3603875ad7bda1fc688f9c053e0d37f7bb31f02 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -17,7 +17,13 @@ #include #include #include +#include "paddle/fluid/memory/allocation/cpu_allocator.h" // for posix_memalign #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#ifndef _WIN32 +#define posix_memalign_free free +#endif DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -40,6 +46,17 @@ void GenBase::dumpCode(const unsigned char* code) const { } } +void* GenBase::operator new(size_t size) { + void* ptr; + constexpr size_t alignment = 32ul; + PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, + "GenBase Alloc %ld error!", size); + PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + return ptr; +} + +void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); } + std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { int block; int max_num_regs; diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index d808a332472ae86240cb63356cb417123523366a..a7c7a35a7ea35bd80333b04f001d4ab5b5d1e06b 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -16,6 +16,7 @@ #include #include // for unique_ptr +#include #include #include "paddle/fluid/operators/jit/kernel_base.h" @@ -28,7 +29,7 @@ namespace jit { class GenBase : public Kernel { public: virtual ~GenBase() = default; - virtual const char* name() const = 0; + virtual std::string name() const = 0; virtual size_t getSize() const = 0; virtual const unsigned char* getCodeInternal() = 0; template @@ -42,6 +43,11 @@ class GenBase : public Kernel { return reinterpret_cast(const_cast(code)); } + void* operator new(size_t size); + void operator delete(void* ptr); + void* operator new[](size_t size) { return operator new(size); } + void operator delete[](void* ptr) { operator delete(ptr); } + protected: void dumpCode(const unsigned char* code) const; }; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index a7d0fd4856edc74237151c64f286d468ad86e7ca..56c6e37ae3c62e1f9af66ef6ed16111dc1e93d9d 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -129,6 +129,7 @@ class LookupTableGradKernel : public framework::OpKernel { "must be either LoDTensor or SelectedRows"); } + int64_t padding_idx = context.Attr("padding_idx"); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. @@ -187,10 +188,15 @@ class LookupTableGradKernel : public framework::OpKernel { memset(d_table_data, 0, d_table->numel() * sizeof(T)); for (int64_t i = 0; i < ids->numel(); ++i) { - PADDLE_ENFORCE_LT(ids_data[i], N); - PADDLE_ENFORCE_GE(ids_data[i], 0); - for (int j = 0; j < D; ++j) { - d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + if (padding_idx != kNoPadding && ids_data[i] == padding_idx) { + // the gradient of padding_idx should be 0, already done by memset, so + // do nothing. + } else { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + } } } } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index e20524012a5839fd250b7426a5efc42b7e87fe87..4b6eef18d8b967af5f3a5df0dee750620e7e412a 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -37,7 +37,7 @@ math_library(concat_and_split) math_library(context_project DEPS im2col math_function) math_library(cross_entropy) math_library(cos_sim_functor) -math_library(depthwise_conv) +math_library(depthwise_conv DEPS cub) math_library(im2col) math_library(sampler) diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index e595f1a627cfefbb91b070b898046cf135dc4988..3a926a716f54a094eba11d63c3b29de27dff274b 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -282,7 +282,7 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { ? mkldnn::inner_product_backward_weights::desc( src, diff_weights, bias, diff_dst) : mkldnn::inner_product_backward_weights::desc( - src, diff_weights, bias, diff_dst); + src, diff_weights, diff_dst); return mkldnn::inner_product_backward_weights::primitive_desc( bwd_weight_desc, engine, pd); diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc index 38e65524e870834710ff29f722c69eadf67d9dbe..08d72a5b3978097f4d3dca2e38bef2c3d89cfdc8 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -34,6 +34,8 @@ std::map}, + {"sum", NG_OPS::BuildSumNode}, {"relu", NG_OPS::BuildUnaryNode}, + {"relu_grad", NG_OPS::BuildReluGradNode}, {"tanh", NG_OPS::BuildUnaryNode}, + {"tanh_grad", NG_OPS::BuildTanhGradNode}, {"top_k", NG_OPS::BuildTopKNode}}; void NgraphBridge::BuildNgNode( diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h index d2974298b0707575624ad2f6935e83d06b4c83bb..2f194a9b8766316fc645f7e22e21fff048fb7d63 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine_op.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -35,7 +35,7 @@ class NgraphEngineOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::proto::VarType::FP32, ctx.GetPlace()); + framework::proto::VarType::FP32, platform::CPUPlace()); return kt; } }; diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index fb574f1bc1160c79f5802f11c00716eccad7f48d..c7d7392080cdc82f1d59314337192ad8ea5fa2d1 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -22,6 +22,8 @@ limitations under the License. */ #pragma once #include "ops/accuracy_op.h" +#include "ops/activation_op.h" +#include "ops/batch_norm_op.h" #include "ops/binary_unary_op.h" #include "ops/conv2d_op.h" #include "ops/elementwise_add_op.h" @@ -31,4 +33,5 @@ limitations under the License. */ #include "ops/pool2d_op.h" #include "ops/scale_op.h" #include "ops/softmax_op.h" +#include "ops/sum_op.h" #include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/activation_op.h b/paddle/fluid/operators/ngraph/ops/activation_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f66080e3aabc05d3ce5ecaa3791de4410e34fa37 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/activation_op.h @@ -0,0 +1,52 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildReluGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto out = platform::GetInputNode(op, "Out", ngb_node_map); + auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto relu_grad = std::make_shared(out, dout); + platform::SetOutputNode(op, "X@GRAD", relu_grad, ngb_node_map); +} + +void BuildTanhGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto out = platform::GetInputNode(op, "Out", ngb_node_map); + auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto shape = out->get_shape(); + auto node_const = + ngraph::op::Constant::create(ngraph::element::f32, shape, {1}); + auto result = dout * (node_const - out * out); + platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2cdd0299760dadc228fb9121585363b23652789a --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h @@ -0,0 +1,150 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildBatchNormNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto& data_layout = op_attrs.Get("data_layout"); + + auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map); + auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map); + auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map); + auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + + const bool is_test = op_attrs.Get("is_test"); + const float epsilon = op_attrs.Get("epsilon"); + const float momentum = op_attrs.Get("momentum"); + + if (data_layout == "NHWC") { + x = paddle::platform::Nhwc2Nchw(x); + } + + std::shared_ptr mean_out, saved_mean, saved_variance, + variance_out, y; + + if (!is_test) { + auto BN = std::make_shared(epsilon, scale, + bias, x); + y = std::make_shared(BN, 0); + saved_mean = std::make_shared(BN, 1); + saved_variance = std::make_shared(BN, 2); + + mean_out = std::make_shared( + paddle::operators::ngraphs::ElementwiseScalar( + momentum, mean), + paddle::operators::ngraphs::ElementwiseScalar( + 1. - momentum, saved_mean)); + variance_out = std::make_shared( + paddle::operators::ngraphs::ElementwiseScalar( + momentum, variance), + paddle::operators::ngraphs::ElementwiseScalar( + 1. - momentum, saved_variance)); + + if (data_layout == "NHWC") { + y = paddle::platform::Nchw2Nhwc(y); + } + + paddle::platform::SetOutputNode(op, "MeanOut", mean_out, ngb_node_map); + paddle::platform::SetOutputNode(op, "VarianceOut", variance_out, + ngb_node_map); + paddle::platform::SetOutputNode(op, "SavedMean", saved_mean, ngb_node_map); + paddle::platform::SetOutputNode(op, "SavedVariance", saved_variance, + ngb_node_map); + paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map); + } else { + y = std::make_shared(epsilon, scale, bias, + x, mean, variance); + paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map); + } +} + +void BuildBatchNormGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto& data_layout = op_attrs.Get("data_layout"); + + auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map); + auto saved_mean = + paddle::platform::GetInputNode(op, "SavedMean", ngb_node_map); + auto saved_variance = + paddle::platform::GetInputNode(op, "SavedVariance", ngb_node_map); + auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map); + auto x_shape = x->get_shape(); + auto dy_shape = dy->get_shape(); + + PADDLE_ENFORCE(x_shape.size() == 2 || x_shape.size() == 4, + "BN grap input size needs to be 2 or 4"); + PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(), + "BN grap input and delta size needs to be equal"); + + if (x_shape.size() == 2) { + x = std::make_shared( + x, ngraph::AxisVector{0, 1}, + ngraph::Shape{x_shape.at(0), x_shape.at(1), 1, 1}); + dy = std::make_shared( + dy, ngraph::AxisVector{0, 1}, + ngraph::Shape{dy_shape.at(0), dy_shape.at(1), 1, 1}); + } + + if (data_layout == "NHWC") { + x = paddle::platform::Nhwc2Nchw(dy); + dy = paddle::platform::Nhwc2Nchw(dy); + } + const float epsilon = op_attrs.Get("epsilon"); + + auto bn_bprop = std::make_shared( + epsilon, scale, bias, x, saved_mean, saved_variance, dy); + + std::shared_ptr dx = + std::make_shared(bn_bprop, 0); + auto dscale = std::make_shared(bn_bprop, 1); + auto dbias = std::make_shared(bn_bprop, 2); + paddle::platform::SetOutputNode(op, "Bias@GRAD", dbias, ngb_node_map); + paddle::platform::SetOutputNode(op, "Scale@GRAD", dscale, ngb_node_map); + if (x_shape.size() == 2) { + paddle::platform::SetOutputNode( + op, "X@GRAD", paddle::platform::NgReshaper(dx, x_shape), ngb_node_map); + } else { + if (data_layout == "NHWC") { + dx = paddle::platform::Nchw2Nhwc(dx); + } + paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map); + } +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/sum_op.h b/paddle/fluid/operators/ngraph/ops/sum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97f4ce64aa58bfa8cb70c36f9a12b7b8135da637 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/sum_op.h @@ -0,0 +1,55 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildSumNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + std::vector op_inputs; + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + op_inputs.push_back(var_name); + if (ngb_node_map->find(var_name) == ngb_node_map->end()) { + PADDLE_THROW("op % input varname %s is not found in var_node_map", + op->Type(), var_name); + } + } + } + std::shared_ptr& sum = ngb_node_map->at(op_inputs[0]); + for (size_t k = 1; k < op_inputs.size(); ++k) { + std::shared_ptr& nodek = ngb_node_map->at(op_inputs[k]); + if (nodek->get_element_type() != sum->get_element_type()) { + nodek = + std::make_shared(nodek, sum->get_element_type()); + } + sum = sum + nodek; + } + platform::SetOutputNode(op, "Out", sum, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 5399ae556e7f38a551d680704d8d825e2fdba88a..fc3636e0b24765f681d3260b07fe854309774a40 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -259,7 +259,7 @@ Example: W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 $$ - For exclusive = true: + For exclusive = false: $$ hstart = i * strides[0] - paddings[0] hend = hstart + ksize[0] @@ -267,7 +267,7 @@ Example: wend = wstart + ksize[1] Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} $$ - For exclusive = false: + For exclusive = true: $$ hstart = max(0, i * strides[0] - paddings[0]) hend = min(H, hstart + ksize[0]) @@ -403,7 +403,7 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$ - For exclusive = true: + For exclusive = false: $$ dstart = i * strides[0] - paddings[0] dend = dstart + ksize[0] @@ -413,7 +413,7 @@ Example: wend = wstart + ksize[2] Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} $$ - For exclusive = false: + For exclusive = true: $$ dstart = max(0, i * strides[0] - paddings[0]) dend = min(D, dstart + ksize[0]) diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index d68ba9d661698bb0d33b139f5748daec2ead6595..ee034b270527376fc268b8a868f90db52c51848a 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -121,7 +121,7 @@ struct RandomCropFunctor { HOSTDEVICE void operator()(size_t ins_idx) { typename Random::Engine engine(seed_); engine.discard(ins_idx * (rank_ - num_batchsize_dims_)); - size_t offsets[9]; + size_t offsets[9] = {}; for (int i = num_batchsize_dims_; i < rank_; ++i) { typename Random::template UniformIntDist dist( 0, x_dims_[i] - out_dims_[i]); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 26ff221dfa0768bd2bcc9e6485a32485f0212ac6..defc29b91f81cb851fec24c5cd9d62dc72c54147 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace operators { @@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() { position_.front().wait(); position_.pop(); } +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get(place_).device); + PADDLE_ENFORCE(cudaStreamDestroy(stream)); + for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event)); + } +#endif } BufferedReader::BufferedReader( @@ -33,6 +41,19 @@ BufferedReader::BufferedReader( thread_pool_(1), place_(place), buffer_size_(buffer_size) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get(place_).device); + compute_stream = + ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events.resize(buffer_size); + for (auto &event : events) + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + } +#endif cpu_buffer_.resize(buffer_size); gpu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); @@ -46,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() { } void BufferedReader::ReadAsync(size_t i) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get(place_).device); + PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream)); + } +#endif position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { TensorVec &cpu = cpu_buffer_[i]; reader_->ReadNext(&cpu); @@ -54,14 +81,41 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } +#ifdef PADDLE_WITH_CUDA + // NOTE(liangdun): using async copy instead of TensorCopySync + // TensorCopySync would block other stream if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get(place_).device); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { - framework::TensorCopySync(cpu[i], place_, &gpu[i]); + gpu[i].Resize(cpu[i].dims()); + gpu[i].set_layout(cpu[i].layout()); + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); + auto size = + cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); + if (platform::is_cuda_pinned_place(cpu_place)) + memory::Copy(boost::get(place_), gpu_ptr, + boost::get(cpu_place), + cpu_ptr, size, stream); + else if ((platform::is_gpu_place(cpu_place))) + memory::Copy(boost::get(place_), gpu_ptr, + boost::get(cpu_place), cpu_ptr, + size, stream); + else + // if cpu place is not pinned, async copy is slower than sync copy, + // so we use sync copy instead. + memory::Copy(boost::get(place_), gpu_ptr, + boost::get(cpu_place), cpu_ptr, size, + 0); gpu[i].set_lod(cpu[i].lod()); } + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } +#endif return i; })); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index cbe2bc1b5fdd69d1a843b768e3289acd621369a6..87680da01a1f51cfdfe4d100508440eda9d1877f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -19,6 +19,9 @@ #include #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif namespace paddle { namespace operators { @@ -59,6 +62,11 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector gpu_buffer_; size_t prev_pos_{-1UL}; +#ifdef PADDLE_WITH_CUDA + cudaStream_t stream; + cudaStream_t compute_stream; + std::vector events; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index f08798794a2f9fc042800583cbc032d6f12bf3dc..43a49de52242b96aade91013e89228fcb3247302 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, framework::LoD lod{lod_data}; lod_tensor.set_lod(lod); int64_t* tensor_data = lod_tensor.mutable_data( - framework::make_ddim({1, static_cast(batch_feasign.size())}), + framework::make_ddim({static_cast(batch_feasign.size()), 1}), platform::CPUPlace()); memcpy(tensor_data, batch_feasign.data(), batch_feasign.size() * sizeof(int64_t)); @@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, // insert label tensor framework::LoDTensor label_tensor; auto* label_tensor_data = label_tensor.mutable_data( - framework::make_ddim({1, static_cast(batch_label.size())}), + framework::make_ddim({static_cast(batch_label.size()), 1}), platform::CPUPlace()); memcpy(label_tensor_data, batch_label.data(), batch_label.size() * sizeof(int64_t)); diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 9f3a254c84d4e04fbcd449644a7e138eff520fbc..6410439816d8ae4a9d1df507819071ce76b5308e 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) { std::vector>> data_slot_6003{b1, b2, b3, b4}; - std::vector label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}}; + std::vector label_dims = {{3, 1}, {3, 1}, {3, 1}, {1, 1}}; LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 5fe4d15ae2c6254a50318813c852b6c314880aba..ebd07d90ebe6b0ba008ac89c01c4f054f96a6da9 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -1,5 +1,9 @@ include(operators) -register_operators() +if(WITH_GPU) + register_operators(DEPS cub) +else() + register_operators() +endif() if(WITH_GPU) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu") diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1f51b5bab3068cc89bffa85de28a9438359659f3..fbb2ac3fe8c5de9b0be593df225677c6a7a89e9c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,4 +1,4 @@ -proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) +proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) -cc_library(place SRCS place.cc DEPS enforce boost) +cc_library(place SRCS place.cc DEPS enforce boost lib_any) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h index b84315995a9d8a65668f57eef67f6dab8c20f9b3..5ee985ea719f8cb28bf8be23823eb6c96f4af1a3 100644 --- a/paddle/fluid/platform/ngraph_helper.h +++ b/paddle/fluid/platform/ngraph_helper.h @@ -23,6 +23,26 @@ limitations under the License. */ namespace paddle { namespace platform { +std::shared_ptr Nhwc2Nchw(std::shared_ptr in) { + auto in_shape = in->get_shape(); + in_shape[0] = in->get_shape()[0]; + in_shape[1] = in->get_shape()[3]; + in_shape[2] = in->get_shape()[1]; + in_shape[3] = in->get_shape()[2]; + ngraph::AxisVector axis_vec = {0, 3, 1, 2}; + return std::make_shared(in, axis_vec, in_shape); +} + +std::shared_ptr Nchw2Nhwc(std::shared_ptr in) { + auto in_shape = in->get_shape(); + in_shape[0] = in->get_shape()[0]; + in_shape[1] = in->get_shape()[2]; + in_shape[2] = in->get_shape()[3]; + in_shape[3] = in->get_shape()[1]; + ngraph::AxisVector axis_vec = {0, 2, 3, 1}; + return std::make_shared(in, axis_vec, in_shape); +} + ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) { auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1, std::multiplies()); diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 655ce8485d4584aa0955315b045da6bf541f7fe2..60b2d83f15746eab0a4d29c7965c064690b6d46d 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -14,6 +14,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" +DEFINE_bool(benchmark, false, + "Doing memory benchmark. It will make deleting scope synchronized, " + "and add some memory usage logs." + "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); + namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 803ea6b26087884ad79c6bf80238953a012eaddc..4ac5b83c56b114f4e3e4c78710716adc636ebe1d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -26,5 +26,5 @@ if(WITH_PYTHON) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) - cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) + cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 39e47be606c07ed216c9fe2ff8fa75552b8b7c76..7db2bb451b49918fd8d92a6036c132d34e965c63 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -74,12 +74,12 @@ void BindPaddleBuf(py::module *m) { .def(py::init([](std::vector &data) { auto buf = PaddleBuf(data.size() * sizeof(float)); std::memcpy(buf.data(), static_cast(data.data()), buf.length()); - return std::move(buf); + return buf; })) .def(py::init([](std::vector &data) { auto buf = PaddleBuf(data.size() * sizeof(int64_t)); std::memcpy(buf.data(), static_cast(data.data()), buf.length()); - return std::move(buf); + return buf; })) .def("resize", &PaddleBuf::Resize) .def("reset", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6549229e05de5f2a809b56775d9788bbf8e5c1ae..351513712cc4297bf7fbe67878aeba162ef66e4d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -295,6 +295,7 @@ PYBIND11_MODULE(core, m) { .def("_get_float_element", TensorGetElement) .def("_set_double_element", TensorSetElement) .def("_get_double_element", TensorGetElement) + .def("_place", [](Tensor &self) { return self.place(); }) .def("_dtype", [](Tensor &self) { return self.type(); }); py::class_(m, "LoDTensor", R"DOC( @@ -673,6 +674,12 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Place") .def(py::init<>()) + .def("is_gpu_place", + [](platform::Place &self) { return platform::is_gpu_place(self); }) + .def("gpu_device_id", + [](platform::Place &self) { + return boost::get(self).device; + }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; @@ -1092,10 +1099,6 @@ All parameter, weight, gradient are variables in Paddle. "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) - .def_property( - "memory_early_delete", - [](const BuildStrategy &self) { return self.memory_early_delete_; }, - [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; }) .def_property( "enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 59e695e6fcb66cbaed1bcc9e861df81b5f73c1ed..90b8fd1a0aab159eb1a829d67485c845182d295b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -54,7 +54,7 @@ ELSE(WIN32) DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ENDIF() -set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) +set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies}) add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 53746afdb25b34b69f89fe0927c877ace62d7d55..fe2ae67ec606b9e8bc936143d246f9a804684e03 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -25,4 +25,5 @@ import paddle.reader import paddle.dataset import paddle.batch import paddle.compat +import paddle.distributed batch = batch.batch diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2 --- /dev/null +++ b/python/paddle/distributed/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tools/run_mp.py b/python/paddle/distributed/launch.py similarity index 83% rename from tools/run_mp.py rename to python/paddle/distributed/launch.py index 2485400ab818ea51ae2608921b63890c7066baf0..03c4078775d455fdb19aaf78ace4dcb98c8dd66a 100644 --- a/tools/run_mp.py +++ b/python/paddle/distributed/launch.py @@ -37,7 +37,7 @@ default_envs = { GPUS = 8 -def start_procs(gpus, cmd, log_dir): +def start_procs(gpus, entrypoint, entrypoint_args, log_dir): procs = [] log_fns = [] os.system("mkdir -p %s" % log_dir) @@ -73,12 +73,11 @@ def start_procs(gpus, cmd, log_dir): "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints }) - print("starting process ", i, cmd, curr_env) + print("starting process ", i, entrypoint, entrypoint_args, curr_env) fn = open("%s/workerlog.%d" % (log_dir, i), "w") log_fns.append(fn) - procs.append( - subprocess.Popen( - cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env)) + cmd = [sys.executable, "-u", entrypoint] + entrypoint_args + procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env)) for i in range(gpus): try: @@ -89,7 +88,8 @@ def start_procs(gpus, cmd, log_dir): pass -def main(): +def parse_args(): + parser = argparse.ArgumentParser( description='''start paddle training using multi-process mode. NOTE: your train program ***must*** run as distributed nccl2 mode, @@ -108,21 +108,27 @@ POD_IP (current node ip address, not needed for local training) type=int, default=8, help='start number of processes for every gpu') - parser.add_argument( - '--cmd', - type=str, - default="", - help='command to run for each process, e.g. python train.py --lr 0.1') parser.add_argument( '--log_dir', type=str, default="mylog", help='directory to put logs per process.') - args = parser.parse_args() - if args.cmd == "": - parser.print_help() - exit(0) - start_procs(args.gpus, args.cmd, args.log_dir) + parser.add_argument( + 'entrypoint_script', + type=str, + help="The entrypoint script to be launched in parallel," + "followed by all the arguments for each process," + "e.g. train.py --lr 0.1") + parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER) + return parser.parse_args() + + +def main(): + args = parse_args() + + # launch multiple training process + start_procs(args.gpus, args.entrypoint_script, args.entrypoint_args, + args.log_dir) if __name__ == "__main__": diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 396f36e188b27fe450cc19b3b8ccf967daf1456c..aa1f85734df40a53200efe74e5904d6ccc53e072 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -161,7 +161,6 @@ def __bootstrap__(): 'times_excess_than_required_tmp_allocation', 'enable_inplace_whitelist' ] - core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py index f2b7ac8375af25beed562b8279b6044f11c09d44..5854cadb58c76066ba4b48dc6b5dbca06fba8cba 100644 --- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py +++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py @@ -22,7 +22,7 @@ This API is still under active development and may change drastically. from __future__ import print_function -import contextlib +from ...wrapped_decorator import signature_safe_contextmanager import numpy as np import six @@ -419,7 +419,7 @@ class TrainingDecoder(object): self._state_cell = state_cell self._state_cell._enter_decoder(self) - @contextlib.contextmanager + @signature_safe_contextmanager def block(self): """ Define the behavior of the decoder for each RNN time step. @@ -613,7 +613,7 @@ class BeamSearchDecoder(object): self._word_dim = word_dim self._input_var_dict = input_var_dict - @contextlib.contextmanager + @signature_safe_contextmanager def block(self): """ Define the behavior of the decoder for each RNN time step. diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b8d5f4ffeadca0a7b103682f175d50dc46fa258a..4f37129234482189436ad71391f55394e2b8a277 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager from .. import core @@ -105,7 +105,7 @@ class Inferencer(object): return results - @contextlib.contextmanager + @signature_safe_contextmanager def _prog_and_scope_guard(self): with framework.program_guard(main_program=self.inference_program): with executor.scope_guard(self.scope): diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 8569e486f91786b5562e84dcdccf6d91da0612cc..d27b808438d53a004db4e85345a68c35d00fff98 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager import os import errno import shutil @@ -453,7 +453,7 @@ class Trainer(object): io.save_inference_model(param_path, feeded_var_names, target_vars, exe) - @contextlib.contextmanager + @signature_safe_contextmanager def _prog_and_scope_guard(self): with framework.program_guard( main_program=self.train_program, diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index d3ff14a17955990bff851e95bd61fbc370ea7aa5..8815911eaeb36067987c0490d7a4f3e909789499 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -17,7 +17,7 @@ from __future__ import print_function import os import multiprocessing import numpy as np -import contextlib +from .wrapped_decorator import signature_safe_contextmanager import six from .framework import Program, default_main_program, Variable from . import core @@ -49,7 +49,7 @@ def _switch_scope(scope): return ex -@contextlib.contextmanager +@signature_safe_contextmanager def scope_guard(scope): """ Change the global/default scope instance by Python `with` statement. All diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index c0b0ad8a202b82183de9ec1edd43cb10db10fb5c..832c97c7deb49b4e118e15989ab7a34da6ce57a0 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -16,7 +16,7 @@ from __future__ import print_function import collections from collections import defaultdict -import contextlib +from .wrapped_decorator import signature_safe_contextmanager import os import re import traceback @@ -111,7 +111,7 @@ class NameScope(object): _name_scope = NameScope() -@contextlib.contextmanager +@signature_safe_contextmanager def name_scope(prefix=None): """ Generate hierarchical name prefix for the operators. @@ -1775,7 +1775,7 @@ class Program(object): def set_op_role_var(self, var_name): self._op_role_var = [var_name] - @contextlib.contextmanager + @signature_safe_contextmanager def _optimized_guard(self, param_and_grads): """ A with guard to set :code:`Optimization` :code:`OpRole` and @@ -1805,7 +1805,7 @@ class Program(object): self._op_role_var = tmp_var self._current_role = tmp_role - @contextlib.contextmanager + @signature_safe_contextmanager def _lr_schedule_guard(self, is_with_opt=False): """ A with guard to set :code:`LRSched` :code:`OpRole` and @@ -2459,7 +2459,7 @@ def switch_startup_program(program): return prev_program -@contextlib.contextmanager +@signature_safe_contextmanager def program_guard(main_program, startup_program=None): """ Change the global main program and startup program with `with` statement. @@ -2524,7 +2524,7 @@ def _get_var(name, program=None): return program.global_block().var(name) -@contextlib.contextmanager +@signature_safe_contextmanager def _imperative_guard(tracer): global _imperative_tracer_ tmp_trace = _imperative_tracer_ @@ -2535,7 +2535,7 @@ def _imperative_guard(tracer): _imperative_tracer_ = tmp_trace -@contextlib.contextmanager +@signature_safe_contextmanager def _imperative_place_guard(place): global _imperative_current_expected_place_ tmp_place = _imperative_current_expected_place_ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index ff3984b11f42cf9e6ff49c8654c600c065effe1d..d4525233cc681720404770ef1d0c5d3006607a2e 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager import numpy as np from paddle.fluid import core @@ -24,7 +24,7 @@ def enabled(): return framework._in_imperative_mode() -@contextlib.contextmanager +@signature_safe_contextmanager def guard(place=None): train = framework.Program() startup = framework.Program() diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 5be21ff7f7270f6ce950c069f61418c922bcedc5..e8341be28683a25971a53a37c70533a16add1593 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -16,7 +16,7 @@ from __future__ import print_function from . import framework import numpy as np -import contextlib +from .wrapped_decorator import signature_safe_contextmanager from .core import VarDesc from . import unique_name @@ -49,7 +49,7 @@ def force_init_on_cpu(): return _force_init_on_cpu_ -@contextlib.contextmanager +@signature_safe_contextmanager def init_on_cpu(): """ Force the variable to be inited on CPU. diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index a172141b3a0455769dc1ce74d098be057324e047..7d1636774c6e27ec8090ac01710e23beed5fd0e8 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -302,7 +302,8 @@ class LayerHelper(object): if default_initializer is None and attr.initializer is None: if isinstance(dtype, core.VarDesc.VarType): if dtype != core.VarDesc.VarType.FP32 and \ - dtype != core.VarDesc.VarType.FP64: + dtype != core.VarDesc.VarType.FP64 and \ + dtype != core.VarDesc.VarType.FP16: raise TypeError( "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" ) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index a7494aaceab42332cb4362ab1df43d9e0b139f4f..3a6753b01f152f61b78a9f04f4fe32136c051a19 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -13,7 +13,7 @@ # limitations under the License. from __future__ import print_function -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager from .layer_function_generator import autodoc, templatedoc from .tensor import assign, fill_constant @@ -1532,7 +1532,7 @@ class DynamicRNN(object): outputs={'Out': [x_reordered]}) return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table) - @contextlib.contextmanager + @signature_safe_contextmanager def block(self): """ The block for user to define operators in RNN. See the class docstring diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index c983e2a44b25c5943df5e822e2e363b2557a6ac3..3b43ae0b9cb63a9f4708a680cb1021d74c197550 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -397,10 +397,10 @@ def box_coder(prior_box, input is image feature map, they are close to the origin of the coordinate system. [xmax, ymax] is the right bottom coordinate of the anchor box. - prior_box_var(Variable|list): prior_box_var supports two types of input. - One is variable with shape [M, 4] holds M group. - The other one is list consist of 4 elements - shared by all boxes. + prior_box_var(Variable|list|None): prior_box_var supports two types + of input. One is variable with shape [M, 4] + holds M group. The other one is list consist of + 4 elements shared by all boxes. target_box(Variable): This input can be a 2-D LoDTensor with shape [N, 4] when code_type is 'encode_center_size'. This input also can be a 3-D Tensor with shape diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 1762bd3e343e8af6768dd23f8fbc58cd0182d3c9..b88be66906e806aeee55c1af6235a6fef9da7030 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -13,7 +13,7 @@ # limitations under the License. from __future__ import print_function -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager import multiprocessing import os import six @@ -1116,7 +1116,7 @@ class Preprocessor(object): def _is_completed(self): return self.sub_block and self.source_var_names and self.sink_var_names - @contextlib.contextmanager + @signature_safe_contextmanager def block(self): self.status = Preprocessor.IN_SUB_BLOCK self.sub_block = self.main_prog._create_block() diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0e4b5aadc0b0d7e87ea1cfb8e18339fe211e1eef..46ce58fd2db19205e4d1194c30cba76a83fa2de8 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2930,6 +2930,7 @@ def batch_norm(input, "momentum": momentum, "epsilon": epsilon, "is_test": is_test, + "data_layout": data_layout, "use_mkldnn": False, "fuse_with_relu": fuse_with_relu, "use_global_stats": use_global_stats diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index e0e781a322b3eb68e3f54a66252a8d8b11a9a56f..fbd04f1eb461268ae98ca74c0bc46cc2717733cb 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -15,7 +15,7 @@ from __future__ import print_function from collections import defaultdict -from contextlib import contextmanager +from .wrapped_decorator import signature_safe_contextmanager from paddle.fluid.framework import Program, Variable, name_scope, default_main_program from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table @@ -1610,7 +1610,7 @@ class ModelAverage(Optimizer): }, stop_gradient=True) - @contextmanager + @signature_safe_contextmanager def apply(self, executor, need_restore=True): """Apply average values to parameters of current model. """ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 52b260efd15066a114a8146106685043654c91ea..22212ae9a216acaab3f295f1f8d091829a0aa471 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -148,7 +148,8 @@ class ParallelExecutor(object): else framework.default_main_program() # FIXME(dzhwinter): enable_inplace should be after memory_optimize # if turn on python memory optimize, turn off the inplace_pass. - build_strategy.enable_inplace = False if main._is_mem_optimized else True + if build_strategy.enable_inplace is None: + build_strategy.enable_inplace = False if main._is_mem_optimized else True scope = scope if scope is not None else executor.global_scope() if share_vars_from and not isinstance(share_vars_from, diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index e05885f5f5bfc169828c1c6e723dffff098c3c2e..d5670dbc823c5d317f27f768c596ed2e009e71b6 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -15,7 +15,7 @@ from __future__ import print_function from . import core -from contextlib import contextmanager +from .wrapped_decorator import signature_safe_contextmanager import os import six @@ -35,7 +35,7 @@ NVPROF_CONFIG = [ ] -@contextmanager +@signature_safe_contextmanager def cuda_profiler(output_file, output_mode=None, config=None): """The CUDA profiler. This fuctions is used to profile CUDA program by CUDA runtime application @@ -217,7 +217,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): core.disable_profiler(key_map[sorted_key], profile_path) -@contextmanager +@signature_safe_contextmanager def profiler(state, sorted_key=None, profile_path='/tmp/profile'): """The profiler interface. Different from cuda_profiler, this profiler can be used to profile both CPU diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 076a942cdde5623faa570bf98f889e8145b60f8b..aa581f23a191639fdc026e7781897d5d996823a9 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -15,14 +15,14 @@ from __future__ import print_function import os -import contextlib +from .wrapped_decorator import signature_safe_contextmanager from . import core __all__ = [ 'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files' ] -@contextlib.contextmanager +@signature_safe_contextmanager def create_recordio_writer(filename, compressor=core.RecordIOWriter.Compressor.Snappy, max_num_records=1000): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 4b26bacce968a6da72e9aa043adb38918b293a35..534411219b500723f3799a08fdf1b7796534376b 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -109,11 +109,12 @@ set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + # change the timeout from 600 to 1200, because in debug mode, this test need more time. + set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200) + endif() endif() -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - # change the timeout from 600 to 900, because in debug mode, this test need more time. - set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 900) -endif() + if (WITH_NGRAPH) add_subdirectory(ngraph) diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py index 13a33e20478372af370d38ab2b475e4425dc8d6e..84b9198dbf6569b7dbd7bd3c953d5254ece178e8 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py @@ -16,14 +16,37 @@ from __future__ import print_function import unittest import numpy as np -import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest -from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp -class TestNGRAPHAccuracyOp(TestAccuracyOp): +class TestNGRAPHAccuracyOp(OpTest): def setUp(self): - super(TestNGRAPHAccuracyOp, self).setUp() + self.op_type = "accuracy" + self.dtype = np.float32 + self.init_dtype() + n = 128 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)) + label = np.random.randint(0, 2, (n, 1)) + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int64"), + 'Total': np.array([n]).astype("int64") + } + self._cpu_only = True + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py index 2bd9bf843039573862a22c85557d416bf82b41f6..034d7792c13efb432e6bef6c95ee554584f29519 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py @@ -18,17 +18,7 @@ import unittest import numpy as np import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest -from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh - - -class TestNGRAPHReluDim2(TestRelu): - def setUp(self): - super(TestNGRAPHReluDim2, self).setUp() - - -class TestNGRAPHTanhDim2(TestTanh): - def setUp(self): - super(TestNGRAPHTanhDim2, self).setUp() +from paddle.fluid.tests.unittests.test_activation_op import TestSigmoid, TestRelu, TestTanh class TestNGRAPHReluDim4(TestRelu): diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..511173af5e5b2a1d1e50d199b55e7d9ace6584f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py @@ -0,0 +1,37 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpTraining, TestBatchNormOpInference + + +class TestNGRAPHBatchNormOpTraining(TestBatchNormOpTraining): + def init_kernel_type(self): + super(TestNGRAPHBatchNormOpTraining, self).init_kernel_type() + + +class TestNGRAPHBatchNormOpInference(TestBatchNormOpInference): + def init_kernel_type(self): + super(TestNGRAPHBatchNormOpInference, self).init_kernel_type() + + +class TestNGRAPHBatchNormOpWithReluInference(TestBatchNormOpInference): + def init_kernel_type(self): + super(TestNGRAPHBatchNormOpWithReluInference, self).init_kernel_type() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py index e5424e8a6e615820b4a1a5f2ee7e7e87dd0b22af..dbc8557b4e1c96c13c5a189b44bed4b5f1aabf4f 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py @@ -15,35 +15,59 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_conv2d_op import * +from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 class TestNGRAPH(TestConv2dOp): + def setUp(self): + super(TestNGRAPH, self).setUp() + self._cpu_only = True + def init_kernel_type(self): super(TestNGRAPH, self).init_kernel_type() class TestNGRAPHWithPad(TestWithPad): + def setUp(self): + super(TestNGRAPHWithPad, self).setUp() + self._cpu_only = True + def init_kernel_type(self): super(TestNGRAPHWithPad, self).init_kernel_type() class TestNGRAPHWithStride(TestWithStride): + def setUp(self): + super(TestNGRAPHWithStride, self).setUp() + self._cpu_only = True + def init_kernel_type(self): super(TestNGRAPHWithStride, self).init_kernel_type() class TestNGRAPHWithGroup(TestWithGroup): + def setUp(self): + super(TestNGRAPHWithGroup, self).setUp() + self._cpu_only = True + def init_kernel_type(self): super(TestNGRAPHWithGroup, self).init_kernel_type() class TestNGRAPHWith1x1(TestWith1x1): + def setUp(self): + super(TestNGRAPHWith1x1, self).setUp() + self._cpu_only = True + def init_kernel_type(self): super(TestNGRAPHWith1x1, self).init_kernel_type() class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def setUp(self): + super(TestNGRAPHWithInput1x1Filter1x1, self).setUp() + self._cpu_only = True + def init_kernel_type(self): super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py index 67722db89bc9007c6247b8fc108f6df177157b7d..67f749bfeeb1bb47a8c5bb3486ac3292c8af8164 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py @@ -14,73 +14,16 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_elementwise_add_op import * +from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_scalar(TestElementwiseAddOp_scalar): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_scalar, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_scalar2, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_Vector(TestElementwiseAddOp_Vector): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_Vector, self).init_input_output() - - -class TesNGRAPHtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0): - def init_input_output(self): - super(TesNGRAPHtElementwiseAddOp_broadcast_0, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_broadcast_1, self).init_input_output() + def setUp(self): + super(TestNGRAPHElementwiseAddOp, self).setUp() + self._cpu_only = True - -class TestNGRAPHElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2): def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_broadcast_2, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_broadcast_3, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_broadcast_4, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_rowwise_add_0( - TestElementwiseAddOp_rowwise_add_0): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_rowwise_add_0, - self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_rowwise_add_1( - TestElementwiseAddOp_rowwise_add_1): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_rowwise_add_1, - self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_channelwise_add( - TestElementwiseAddOp_channelwise_add): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_channelwise_add, - self).init_input_output() + super(TestNGRAPHElementwiseAddOp, self).init_input_output() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py index 5535427ea8a93fdc5818cdc058aedb6fe72165ee..11881ac6e5292ce2beea1b353c6ca857ada28839 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py @@ -14,17 +14,13 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp +from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp class TestNGRAPHMeanOp(TestMeanOp): def setUp(self): super(TestNGRAPHMeanOp, self).setUp() - - -class TestNGRAPHFP16MeanOp(TestFP16MeanOp): - def setUp(self): - super(TestNGRAPHFP16MeanOp, self).setUp() + self._cpu_only = True if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py index 6aba62f7c08e3fe646372c851622f2e321b3aee2..a916c8d450f4a218c85f39c31737b2efa0ef926d 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py @@ -15,27 +15,38 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2 +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest + + +class TestNGRAPHMulOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.dtype = np.float32 + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((2, 4)).astype(self.dtype), + 'Y': np.random.random((4, 4)).astype(self.dtype) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + self._cpu_only = True - -class TestNGRAPHMulOp(TestMulOp): def init_dtype_type(self): pass + def test_check_output(self): + self.check_output() -class TestNGRAPHMulOp2(TestMulOp2): - def init_dtype_type(self): - pass + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) -class TestNGRAPHFP16MulOp1(TestFP16MulOp1): - def init_dtype_type(self): - pass - - -class TestNGRAPHFP16MulOp2(TestFP16MulOp2): - def init_dtype_type(self): - pass + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py index 95e592e8ec036ad231ed57ddbc706683cb7aa153..96a2b72d8add9cc765a8536932b72426c8489025 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py @@ -14,35 +14,59 @@ from __future__ import print_function -from paddle.fluid.tests.unittests.test_pool2d_op import * +from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 class TestNGRAPHPool2D_Op(TestPool2D_Op): + def setUp(self): + super(TestNGRAPHPool2D_Op, self).setUp() + self._cpu_only = True + def init_test_case(self): super(TestNGRAPHPool2D_Op, self).init_test_case() class TestNGRAPHCase1(TestCase1): + def setUp(self): + super(TestNGRAPHCase1, self).setUp() + self._cpu_only = True + def init_test_case(self): super(TestNGRAPHCase1, self).init_test_case() class TestNGRAPHCase2(TestCase2): + def setUp(self): + super(TestNGRAPHCase2, self).setUp() + self._cpu_only = True + def init_test_case(self): super(TestNGRAPHCase2, self).init_test_case() class TestNGRAPHCase3(TestCase3): + def setUp(self): + super(TestNGRAPHCase3, self).setUp() + self._cpu_only = True + def init_pool_type(self): super(TestNGRAPHCase3, self).init_pool_type() class TestNGRAPHCase4(TestCase4): + def setUp(self): + super(TestNGRAPHCase4, self).setUp() + self._cpu_only = True + def init_pool_type(self): super(TestNGRAPHCase4, self).init_pool_type() class TestNGRAPHCase5(TestCase5): + def setUp(self): + super(TestNGRAPHCase5, self).setUp() + self._cpu_only = True + def init_pool_type(self): super(TestNGRAPHCase5, self).init_pool_type() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py index b42a1f73fa72b0dab936a3bb61a8893978b229ec..4da5ca4583c65d69ead8d3e9886605a7ad104cc0 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py @@ -13,25 +13,23 @@ # limitations under the License. from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows +from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows class TestNGRAPHScaleOp(TestScaleOp): - def init_dtype_type(self): - pass + def setUp(self): + super(TestNGRAPHScaleOp, self).setUp() + self._cpu_only = True - -class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows): def init_dtype_type(self): pass -class TestNGRAPHScaleFp16Op(TestScaleFp16Op): - def init_dtype_type(self): - pass - +class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows): + def setUp(self): + super(TestNGRAPHScaleOpSelectedRows, self).setUp() + self._cpu_only = True -class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows): def init_dtype_type(self): pass diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ed9fb618024301818a12fd0d02b09c6f3a5f2c53 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py @@ -0,0 +1,19 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import unittest +from paddle.fluid.tests.unittests.test_sum_op import TestSumOp, TestSelectedRowsSumOp, TestLoDTensorAndSelectedRowsOp + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py index 3a0171087dce5d4c7b72eca7f7e4fb955af94812..fa68df1adf2cfb31c63b70098a6fedc2ab3913aa 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py @@ -20,21 +20,25 @@ from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, class TestNGRAPHTopkOp(TestTopkOp): def setUp(self): super(TestNGRAPHTopkOp, self).setUp() + self._cpu_only = True class TestNGRAPHTopkOp2(TestTopkOp2): def setUp(self): super(TestNGRAPHTopkOp2, self).setUp() + self._cpu_only = True class TestNGRAPHTopkOp3(TestTopkOp3): def setUp(self): super(TestNGRAPHTopkOp3, self).setUp() + self._cpu_only = True class TestNGRAPHTopkOp4(TestTopkOp4): def setUp(self): super(TestNGRAPHTopkOp4, self).setUp() + self._cpu_only = True if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 6156268bf25ada310a3d22242ecff4b9cdf1759a..220bffebe83925c60af65aa9594ddd8a29c38145 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -34,7 +34,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): pb_y = pb_y.reshape(shape) if pb_v.ndim == 2: - pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) + var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else ( + pb_v.shape[0], 1, pb_v.shape[1]) + pb_v = pb_v.reshape(var_shape) if pb_v.ndim == 1: tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y @@ -125,33 +127,6 @@ class TestBoxCoderOp(OpTest): self.outputs = {'OutputBox': output_box} -class TestBoxCoderOpWithOneRankVar(OpTest): - def test_check_output(self): - self.check_output() - - def setUp(self): - self.op_type = "box_coder" - lod = [[1, 1, 1, 1, 1]] - prior_box = np.random.random((81, 4)).astype('float32') - prior_box_var = np.random.random((4)).astype('float32') - target_box = np.random.random((20, 81, 4)).astype('float32') - code_type = "DecodeCenterSize" - box_normalized = False - output_box = batch_box_coder(prior_box, prior_box_var, target_box, - lod[0], code_type, box_normalized) - - self.inputs = { - 'PriorBox': prior_box, - 'PriorBoxVar': prior_box_var, - 'TargetBox': target_box, - } - self.attrs = { - 'code_type': 'decode_center_size', - 'box_normalized': False - } - self.outputs = {'OutputBox': output_box} - - class TestBoxCoderOpWithoutBoxVar(OpTest): def test_check_output(self): self.check_output() @@ -210,7 +185,7 @@ class TestBoxCoderOpWithAxis(OpTest): self.op_type = "box_coder" lod = [[1, 1, 1, 1, 1]] prior_box = np.random.random((30, 4)).astype('float32') - prior_box_var = np.random.random((4)).astype('float32') + prior_box_var = np.random.random((30, 4)).astype('float32') target_box = np.random.random((30, 81, 4)).astype('float32') code_type = "DecodeCenterSize" box_normalized = False diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py index 67a8d8f0721c2c75b432d68d64be8fc1035ffc74..690875662e666aab63ac5eb62df0fb52823b8dff 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_op.py @@ -109,5 +109,32 @@ class TestExpandOpRank4(OpTest): self.check_grad(['X'], 'Out') +class TestExpandOpInteger(OpTest): + def setUp(self): + self.op_type = "expand" + self.inputs = { + 'X': np.random.randint( + 10, size=(2, 4, 5)).astype("int32") + } + self.attrs = {'expand_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + +class TestExpandOpBoolean(OpTest): + def setUp(self): + self.op_type = "expand" + self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")} + self.attrs = {'expand_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index b9957a699e597898bee75ce0e7283f7224293f0c..324257c13ff9828b341ca9affe8186387688c0bf 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -15,7 +15,7 @@ from __future__ import print_function import collections -import contextlib +from .wrapped_decorator import signature_safe_contextmanager import six import sys @@ -68,7 +68,7 @@ def switch(new_generator=None): return old -@contextlib.contextmanager +@signature_safe_contextmanager def guard(new_generator=None): if isinstance(new_generator, six.string_types): new_generator = UniqueNameGenerator(new_generator) diff --git a/python/paddle/fluid/wrapped_decorator.py b/python/paddle/fluid/wrapped_decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..7e7dbff65611e947d1a11a0c33c6ecc27e6df636 --- /dev/null +++ b/python/paddle/fluid/wrapped_decorator.py @@ -0,0 +1,30 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import decorator +import contextlib + +__all__ = ['wrap_decorator', 'signature_safe_contextmanager'] + + +def wrap_decorator(decorator_func): + @decorator.decorator + def __impl__(func, *args, **kwargs): + wrapped_func = decorator_func(func) + return wrapped_func(*args, **kwargs) + + return __impl__ + + +signature_safe_contextmanager = wrap_decorator(contextlib.contextmanager) diff --git a/python/requirements.txt b/python/requirements.txt index 03d5e33e88cd5f1138ca8f6a6e885d6acfbc260e..5a70f1aa3ffc0ab6d4d148eb5bc26981784f2d32 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -11,3 +11,4 @@ graphviz six funcsigs pyyaml +decorator diff --git a/python/setup.py.in b/python/setup.py.in index f93f0cd130e33311bade2b15726c3eff37546214..a7c1e91f9c3a9597d799659a0abe3c9f56e54a57 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -100,6 +100,7 @@ packages=['paddle', 'paddle.utils', 'paddle.dataset', 'paddle.reader', + 'paddle.distributed', 'paddle.fluid', 'paddle.fluid.imperative', 'paddle.fluid.proto',