提交 77fc42d2 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into fea/jitkernel

...@@ -52,6 +52,7 @@ ExternalProject_Add( ...@@ -52,6 +52,7 @@ ExternalProject_Add(
PREFIX ${ANAKIN_SOURCE_DIR} PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS ${CMAKE_ARGS_PREFIX} CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
-DUSE_LOGGER=YES
-DUSE_X86_PLACE=YES -DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO -DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
......
...@@ -27,7 +27,6 @@ endfunction() ...@@ -27,7 +27,6 @@ endfunction()
CheckCompilerCXX11Flag() CheckCompilerCXX11Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# safe_set_flag # safe_set_flag
# #
# Set a compile flag only if compiler is support # Set a compile flag only if compiler is support
...@@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name) ...@@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name)
endif() endif()
endmacro() endmacro()
macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
if (BUILD_SHARED_LIBS)
return() # if build shared libs, the flags keep same with '/MD'
endif(BUILD_SHARED_LIBS)
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS)
...@@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "") ...@@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "")
# Common flags. the compiler flag used for C/C++ sources whenever release or debug # Common flags. the compiler flag used for C/C++ sources whenever release or debug
# Do not care if this flag is support for gcc. # Do not care if this flag is support for gcc.
# https://github.com/PaddlePaddle/Paddle/issues/12773
if (NOT WIN32)
set(COMMON_FLAGS set(COMMON_FLAGS
-fPIC -fPIC
-fno-omit-frame-pointer -fno-omit-frame-pointer
-Werror
-Wall -Wall
-Wextra -Wextra
-Wnon-virtual-dtor -Wnon-virtual-dtor
...@@ -114,11 +131,6 @@ set(COMMON_FLAGS ...@@ -114,11 +131,6 @@ set(COMMON_FLAGS
-Wno-error=terminate # Warning in PADDLE_ENFORCE -Wno-error=terminate # Warning in PADDLE_ENFORCE
) )
# https://github.com/PaddlePaddle/Paddle/issues/12773
if (NOT WIN32)
list(APPEND COMMON_FLAGS -Werror)
endif()
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
-fPIC -fPIC
-fno-omit-frame-pointer -fno-omit-frame-pointer
...@@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS ...@@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS
-Wno-error=array-bounds # Warnings in Eigen::array -Wno-error=array-bounds # Warnings in Eigen::array
) )
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
endif(NOT WIN32)
if (APPLE) if (APPLE)
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
# On Mac OS X build fat binaries with x86_64 architectures by default. # On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif() endif()
else() endif(APPLE)
if(LINUX)
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
-Wall -Wall
-Wextra -Wextra
-Werror -Werror
${GPU_COMMON_FLAGS}) ${GPU_COMMON_FLAGS})
endif() endif(LINUX)
if(UNIX AND NOT APPLE) if(UNIX AND NOT APPLE)
# except apple from nix*Os family # except apple from nix*Os family
set(LINUX TRUE) set(LINUX TRUE)
endif(UNIX AND NOT APPLE) endif(UNIX AND NOT APPLE)
foreach(flag ${COMMON_FLAGS}) foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach() endforeach()
foreach(flag ${GPU_COMMON_FLAGS}) foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag}) safe_set_nvflag(${flag})
endforeach() endforeach()
if(WIN32)
# windows build turn off warnings.
safe_set_static_flag()
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/W3")
endforeach(flag_var)
endif(WIN32)
add_custom_target(paddle_apis ALL add_custom_target(paddle_apis ALL
DEPENDS paddle_v2_apis paddle_fluid_apis) DEPENDS paddle_v2_apis)
add_custom_target(paddle_docs ALL add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn DEPENDS paddle_v2_docs paddle_v2_docs_cn
paddle_fluid_docs paddle_fluid_docs_cn
paddle_mobile_docs paddle_mobile_docs_cn) paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2) add_subdirectory(v2)
add_subdirectory(fluid)
add_subdirectory(mobile) add_subdirectory(mobile)
...@@ -102,8 +102,8 @@ class Float16Transpiler: ...@@ -102,8 +102,8 @@ class Float16Transpiler:
continue continue
for input_arg in current_op.input_arg_names: for input_arg in current_op.input_arg_names:
if input_arg in self.input_map: if input_arg in self.input_map:
current_op.rename_input(input_arg, current_op._rename_input(input_arg,
self.input_map[input_arg]) self.input_map[input_arg])
def _remove_unused_var(self): def _remove_unused_var(self):
''' '''
...@@ -187,7 +187,7 @@ class Float16Transpiler: ...@@ -187,7 +187,7 @@ class Float16Transpiler:
shape=var.shape, shape=var.shape,
persistable=var.persistable) persistable=var.persistable)
find_op(var) find_op(var)
var.op.rename_output(var_name, tmp_var_name) var.op._rename_output(var_name, tmp_var_name)
self.block._insert_op( self.block._insert_op(
i, i,
type="cast", type="cast",
......
...@@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords= ...@@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
...@@ -38,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en ...@@ -38,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
...@@ -170,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_ ...@@ -170,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_
paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...@@ -241,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg ...@@ -241,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg
paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
...@@ -286,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw ...@@ -286,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -315,13 +298,18 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs ...@@ -315,13 +298,18 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
......
...@@ -13,3 +13,5 @@ if(WITH_INFERENCE) ...@@ -13,3 +13,5 @@ if(WITH_INFERENCE)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
add_subdirectory(inference) add_subdirectory(inference)
endif() endif()
add_subdirectory(train)
...@@ -56,9 +56,9 @@ else() ...@@ -56,9 +56,9 @@ else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif() endif()
if (NOT WIN32) if (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
else() else()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
endif (NOT WIN32) endif (NOT WIN32)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
...@@ -141,20 +141,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) ...@@ -141,20 +141,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else() else()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op)
endif() endif()
if (NOT WIN32) if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph graph_viz_pass multi_devices_graph_pass graph build_strategy
multi_devices_graph_print_pass multi_devices_graph_check_pass fast_threaded_ssa_graph_executor)
fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
endif() # NOT WIN32 endif() # NOT WIN32
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto)
......
...@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu ...@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
# device_context reduce_op_handle ) # device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace details {
class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) {
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
// Add op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path",
new std::string(graph_path));
}
}
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy_.debug_graphviz_path_);
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
}
// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
}
private:
BuildStrategy strategy_;
};
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
const {
pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
return pass_builder_;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else
const bool use_cuda) const {
#endif
// Create a default one if not initialized by user.
if (!pass_builder_) {
CreatePassesFromStrategy();
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") {
pass->Erase("places");
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
pass->Erase("loss_var_name");
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
pass->Erase("params");
pass->SetNotOwned<const std::unordered_set<std::string>>("params",
&param_names);
pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
}
graph = pass->Apply(std::move(graph));
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
...@@ -15,6 +15,17 @@ ...@@ -15,6 +15,17 @@
#pragma once #pragma once
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -57,6 +68,30 @@ struct BuildStrategy { ...@@ -57,6 +68,30 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false}; bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false}; bool enable_data_balance_{false};
// User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes
// from python side.
// A new PassBuilder is created based on configs defined above and
// passes are owned by the PassBuilder.
std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(
const ProgramDesc &main_program,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
#else
const bool use_cuda) const;
#endif
private:
mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
}; };
} // namespace details } // namespace details
......
...@@ -20,79 +20,37 @@ namespace paddle { ...@@ -20,79 +20,37 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Change it to thread safe flags if needed. template <class T>
class ThreadUnsafeOwnershipFlags { class COWPtr {
public: public:
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} typedef std::shared_ptr<T> RefPtr;
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void SetOwnership(bool flag) { flag_ = flag; }
// Invoke the callback if it is not owned.
template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
}
}
private: private:
bool flag_; RefPtr m_sp;
};
// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public: public:
// Ctor from raw pointer. COWPtr() : m_sp(nullptr) {}
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} explicit COWPtr(T* t) : m_sp(t) {}
// Move methods. Steal ownership from origin const T& Data() const { return *m_sp; }
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
// Copy methods. Not own payload
COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
COWPtr& operator=(const COWPtr& other) {
payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
}
// Access read only data.
const T& Data() const { return *payload_; }
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() { T* MutableData() {
ownership_.AcquireOwnershipOnce( DetachIfNotUnique();
[this] { payload_.reset(new T(*payload_)); }); return m_sp.get();
return payload_.get();
} }
private: void DetachIfNotUnique() {
// Actual data pointer. T* tmp = m_sp.get();
std::shared_ptr<T> payload_; if (!(tmp == nullptr || m_sp.unique())) {
Detach();
}
}
// Ownership flag. void Detach() {
OwnershipFlags ownership_; T* tmp = m_sp.get();
m_sp = RefPtr(new T(*tmp));
}
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -30,6 +30,14 @@ TEST(COWPtr, all) { ...@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10); ASSERT_EQ(ptr2.Data(), 10);
} }
TEST(COWPtr, change_old) {
COWPtr<int> ptr(new int{0});
COWPtr<int> ptr2 = ptr;
*ptr.MutableData() = 10;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(ptr.Data(), 10);
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap ...@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
pass_library(graph_to_program_pass base) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base) pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass inference) pass_library(fc_fuse_pass inference)
if(WITH_MKLDNN) if (WITH_MKLDNN)
pass_library(conv_relu_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference)
endif() endif ()
pass_library(attention_lstm_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference) pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass inference) pass_library(fc_lstm_fuse_pass inference)
...@@ -41,12 +41,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass ...@@ -41,12 +41,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if(WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
endif() endif ()
...@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl( ...@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
PDPattern external_pattern, subblock_pattern; PDPattern external_pattern, subblock_pattern;
// Use the following variables to tell whether this model is RNN1.
// This fuse can only works on the RNN1 model.
std::unordered_set<std::string> specified_vars({"data_lod_attention",
"cell_init", "hidden_init",
"data", "week", "minute"});
int count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsVar() && specified_vars.count(node->Name())) {
++count;
}
}
if (count < specified_vars.size()) {
return graph;
}
// Continue to fuse.
FindWhileOp(graph.get()); FindWhileOp(graph.get());
return graph; return graph;
} }
......
...@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
const std::string BatchedCellPreAct = const std::string BatchedCellPreAct =
patterns::UniqueKey("BatchedCellPreAct"); patterns::UniqueKey("BatchedCellPreAct");
const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
const std::string CheckedCell = patterns::UniqueKey("CheckedCell");
scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>(); scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>(); scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>(); scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {}); op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {}); op_desc.SetInput("C0", {});
...@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
op_desc.SetOutput("BatchedGate", {BatchedGate}); op_desc.SetOutput("BatchedGate", {BatchedGate});
op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
op_desc.SetOutput("BatchedInput", {BatchedInput}); op_desc.SetOutput("BatchedInput", {BatchedInput});
op_desc.SetOutput("CheckedCell", {CheckedCell});
op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr // TODO(TJ): get from attr
......
...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm> #include <algorithm>
#include <deque>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList( ...@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
return adj_list; return adj_list;
} }
size_t GraphNum(const Graph &graph) {
std::unordered_set<ir::Node *> nodes = graph.Nodes();
std::unordered_set<ir::Node *> visited_nodes;
visited_nodes.reserve(nodes.size());
std::deque<ir::Node *> q_nodes;
std::vector<std::unordered_set<ir::Node *>> graph_nodes;
std::unordered_set<ir::Node *> g_nodes;
size_t graph_count = 0;
auto traverse_nodes = [&visited_nodes,
&q_nodes](const std::vector<ir::Node *> &nodes) {
std::copy_if(
nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
[&visited_nodes](Node *node) { return !visited_nodes.count(node); });
};
while (visited_nodes.size() != nodes.size()) {
if (!q_nodes.empty()) {
auto cur_node = q_nodes.front();
q_nodes.pop_front();
visited_nodes.insert(cur_node);
g_nodes.insert(cur_node);
traverse_nodes(cur_node->inputs);
traverse_nodes(cur_node->outputs);
} else {
++graph_count;
if (g_nodes.size()) {
graph_nodes.emplace_back(g_nodes);
}
g_nodes.clear();
for (auto &n : nodes) {
if (visited_nodes.count(n) == 0) {
q_nodes.push_back(n);
break;
}
}
}
}
if (g_nodes.size()) {
graph_nodes.emplace_back(g_nodes);
}
if (VLOG_IS_ON(10)) {
VLOG(10) << "graph_num: " << graph_nodes.size();
for (auto &g_n : graph_nodes) {
VLOG(10) << "graph_nodes: " << g_n.size();
if (g_n.size() < 10) {
std::stringstream out;
for (auto &node : g_n) {
out << "\nNode: " << node->Name() << " in [";
for (auto &n : node->inputs) {
out << n->Name() << ", ";
}
out << "], out[";
for (auto &n : node->outputs) {
out << n->Name() << ", ";
}
out << "]";
}
VLOG(10) << out.str();
}
}
}
return graph_count;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -27,6 +27,8 @@ namespace ir { ...@@ -27,6 +27,8 @@ namespace ir {
// Test if the graph contains circle. // Test if the graph contains circle.
bool HasCircle(const Graph &graph); bool HasCircle(const Graph &graph);
size_t GraphNum(const Graph &graph);
// Topology Sort the operations in the graph from inputs to outputs. // Topology Sort the operations in the graph from inputs to outputs.
// `graph` cannot contain circle. // `graph` cannot contain circle.
std::vector<ir::Node *> TopologySortOperations(const Graph &graph); std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
......
...@@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) { ...@@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) {
ASSERT_EQ(node_map.at("op2"), 1UL); ASSERT_EQ(node_map.at("op2"), 1UL);
ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
} }
void BuildZeroGraph(Graph* g) {}
void BuildOneGraph(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3-v4->o5
o3->outputs.push_back(v4);
o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
v4->outputs.push_back(o5);
}
void BuildTwoGraphs(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
// o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
// v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3-v4->o5
o3->outputs.push_back(v4);
// o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
// v4->outputs.push_back(o5);
}
TEST(GraphHelperTest, GraphNum) {
ProgramDesc prog;
Graph g(prog);
BuildZeroGraph(&g);
ASSERT_EQ(GraphNum(g), 0);
Graph g2(prog);
BuildOneGraph(&g2);
ASSERT_EQ(GraphNum(g2), 1);
Graph g3(prog);
BuildTwoGraphs(&g3);
ASSERT_EQ(GraphNum(g3), 2);
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_traits.h"
#include <vector>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
......
...@@ -19,7 +19,6 @@ namespace paddle { ...@@ -19,7 +19,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const { std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
for (const std::string& attr : required_pass_attrs_) { for (const std::string& attr : required_pass_attrs_) {
PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
......
...@@ -42,6 +42,8 @@ class Pass { ...@@ -42,6 +42,8 @@ class Pass {
attr_dels_.clear(); attr_dels_.clear();
} }
std::string Type() const { return type_; }
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const; std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
// Get a reference to the attributed previously set. // Get a reference to the attributed previously set.
...@@ -52,6 +54,21 @@ class Pass { ...@@ -52,6 +54,21 @@ class Pass {
return *boost::any_cast<AttrType *>(attrs_.at(attr_name)); return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
} }
bool Has(const std::string &attr_name) const {
return attrs_.find(attr_name) != attrs_.end();
}
void Erase(const std::string &attr_name) {
if (!Has(attr_name)) {
return;
}
if (attr_dels_.find(attr_name) != attr_dels_.end()) {
attr_dels_[attr_name]();
attr_dels_.erase(attr_name);
}
attrs_.erase(attr_name);
}
// Set a pointer to the attribute. Pass takes ownership of the attribute. // Set a pointer to the attribute. Pass takes ownership of the attribute.
template <typename AttrType> template <typename AttrType>
void Set(const std::string &attr_name, AttrType *attr) { void Set(const std::string &attr_name, AttrType *attr) {
...@@ -68,13 +85,15 @@ class Pass { ...@@ -68,13 +85,15 @@ class Pass {
// should delete the attribute. // should delete the attribute.
template <typename AttrType> template <typename AttrType>
void SetNotOwned(const std::string &attr_name, AttrType *attr) { void SetNotOwned(const std::string &attr_name, AttrType *attr) {
PADDLE_ENFORCE(attrs_.count(attr_name) == 0); PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
attr_name);
attrs_[attr_name] = attr; attrs_[attr_name] = attr;
} }
protected: protected:
virtual std::unique_ptr<Graph> ApplyImpl( virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
std::unique_ptr<Graph> graph) const = 0; LOG(FATAL) << "Calling virtual Pass not implemented.";
}
private: private:
template <typename PassType> template <typename PassType>
...@@ -89,7 +108,10 @@ class Pass { ...@@ -89,7 +108,10 @@ class Pass {
required_graph_attrs_.insert(attrs.begin(), attrs.end()); required_graph_attrs_.insert(attrs.begin(), attrs.end());
} }
void RegisterType(const std::string &type) { type_ = type; }
mutable bool applied_{false}; mutable bool applied_{false};
std::string type_;
std::unordered_set<std::string> required_pass_attrs_; std::unordered_set<std::string> required_pass_attrs_;
std::unordered_set<std::string> required_graph_attrs_; std::unordered_set<std::string> required_graph_attrs_;
std::map<std::string, boost::any> attrs_; std::map<std::string, boost::any> attrs_;
...@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar { ...@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
"'%s' is registered more than once.", pass_type); "'%s' is registered more than once.", pass_type);
PassRegistry::Instance().Insert( PassRegistry::Instance().Insert(
pass_type, [this]() -> std::unique_ptr<Pass> { pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
std::unique_ptr<Pass> pass(new PassType()); std::unique_ptr<Pass> pass(new PassType());
pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
pass->RegisterType(pass_type);
return pass; return pass;
}); });
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/pass_builder.h"
namespace paddle {
namespace framework {
namespace ir {
std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
auto pass = ir::PassRegistry::Instance().Get(pass_type);
passes_.emplace_back(pass.release());
return passes_.back();
}
void PassBuilder::RemovePass(size_t idx) {
PADDLE_ENFORCE(passes_.size() > idx);
passes_.erase(passes_.begin() + idx);
}
std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
const std::string& pass_type) {
PADDLE_ENFORCE(passes_.size() >= idx);
std::shared_ptr<Pass> pass(
ir::PassRegistry::Instance().Get(pass_type).release());
passes_.insert(passes_.begin() + idx, std::move(pass));
return passes_[idx];
}
} // namespace ir
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class PassBuilder {
public:
PassBuilder() {}
virtual ~PassBuilder() {}
// Append a new pass to the end.
std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
// Insert a new pass after `idx`.
std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
// Remove a new pass at `idx`.
void RemovePass(size_t idx);
// Returns a list of all passes.
std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
protected:
std::vector<std::shared_ptr<Pass>> passes_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) { ...@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2); ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2); ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
try { // Allow apply more than once.
graph = pass->Apply(std::move(graph)); graph.reset(new Graph(prog));
} catch (paddle::platform::EnforceNotMet e) { graph->Set<int>("test_graph_attr", new int);
exception = std::string(e.what()); graph = pass->Apply(std::move(graph));
}
ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
pass = PassRegistry::Instance().Get("test_pass"); pass = PassRegistry::Instance().Get("test_pass");
pass->SetNotOwned<int>("test_pass_attr", &val); pass->SetNotOwned<int>("test_pass_attr", &val);
......
...@@ -17,10 +17,13 @@ ...@@ -17,10 +17,13 @@
#include <algorithm> #include <algorithm>
#include <initializer_list> #include <initializer_list>
#include <memory> #include <memory>
#include <mutex> // NOLINT
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h" #include "glog/logging.h"
...@@ -28,206 +31,436 @@ namespace paddle { ...@@ -28,206 +31,436 @@ namespace paddle {
namespace framework { namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
private:
void ClearMemory() const {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Vector<T> implements the std::vector interface, and can get Data or // Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside. // MutableData from any place. The data will be synced implicitly inside.
template <typename T> template <typename T>
class Vector { class Vector {
public: public:
using value_type = T; using value_type = T;
using iterator = typename std::vector<T>::iterator;
using const_iterator = typename std::vector<T>::const_iterator;
// Default ctor. Create empty Vector private:
Vector() { InitEmpty(); } // The actual class to implement vector logic
class VectorData {
public:
VectorData() : flag_(kDataInCPU) {}
VectorData(size_t count, const T &value)
: cpu_(count, value), flag_(kDataInCPU) {}
VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
~VectorData() {}
VectorData(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
}
// Fill vector with value. The vector size is `count`. VectorData &operator=(const VectorData &o) {
explicit Vector(size_t count, const T &value = T()) { o.ImmutableCPU();
InitEmpty(); cpu_ = o.cpu_;
if (count != 0) { flag_ = kDataInCPU;
resize(count); details::CUDABuffer null;
T *ptr = begin(); gpu_.Swap(null);
for (size_t i = 0; i < count; ++i) { return *this;
ptr[i] = value; }
T &operator[](size_t i) {
MutableCPU();
return cpu_[i];
}
const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_[i];
}
size_t size() const { return cpu_.size(); }
iterator begin() {
MutableCPU();
return cpu_.begin();
}
iterator end() {
MutableCPU();
return cpu_.end();
}
T &front() {
MutableCPU();
return cpu_.front();
}
T &back() {
MutableCPU();
return cpu_.back();
}
const_iterator begin() const {
ImmutableCPU();
return cpu_.begin();
}
const_iterator end() const {
ImmutableCPU();
return cpu_.end();
}
const T &back() const {
ImmutableCPU();
return cpu_.back();
}
T *data() { return &(*this)[0]; }
const T *data() const { return &(*this)[0]; }
const T &front() const {
ImmutableCPU();
return cpu_.front();
}
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
MutableCPU();
cpu_.assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
MutableCPU();
cpu_.push_back(elem);
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
MutableCPU();
auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
std::copy(begin, end, out_it);
}
// resize the vector
void resize(size_t size) {
MutableCPU();
cpu_.resize(size);
}
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return reinterpret_cast<T *>(gpu_.data_);
}
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
const T *ptr = CUDAData(place);
flag_ = kDirty | kDataInCUDA;
return const_cast<T *>(ptr);
}
// clear
void clear() {
cpu_.clear();
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { return cpu_.capacity(); }
// reserve data
void reserve(size_t size) const { cpu_.reserve(size); }
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
ImmutableCPU();
return cpu_;
}
bool operator==(const VectorData &other) const {
ImmutableCPU();
other.ImmutableCPU();
return cpu_ == other.cpu_;
}
std::mutex &Mutex() const { return mtx_; }
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) {
return nullptr;
} else {
return std::unique_ptr<platform::CUDAPlace>(
new platform::CUDAPlace(gpu_.place_));
} }
} }
}
// Ctor with init_list private:
Vector(std::initializer_list<T> init) { enum DataFlag {
if (init.size() == 0) { kDataInCPU = 0x01,
InitEmpty(); kDataInCUDA = 0x02,
} else { // kDirty means the data has been changed in one device.
InitByIter(init.size(), init.begin(), init.end()); kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::Place(gpu_.place_)));
auto stream = dev_ctx->stream();
void *src = gpu_.data_;
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
stream);
dev_ctx->Wait();
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
} }
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
CopyCPUDataToCUDA(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen");
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen.");
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
}
void ImmutableCPU() const {
if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or
// CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
SetFlag(kDataInCPU);
}
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
bool IsDirty() const { return flag_ & kDirty; }
bool IsInCUDA() const { return flag_ & kDataInCUDA; }
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable int flag_;
mutable std::mutex mtx_;
};
public:
// Default ctor. Create empty Vector
Vector() : m_(new VectorData()) {}
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T())
: m_(new VectorData(count, value)) {}
// Ctor with init_list
Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
// implicit cast from std::vector. // implicit cast from std::vector.
template <typename U> template <typename U>
Vector(const std::vector<U> &dat) { // NOLINT Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
}
} }
// Copy ctor // Copy ctor
Vector(const Vector<T> &other) { this->operator=(other); } Vector(const Vector<T> &other) { m_ = other.m_; }
// Copy operator // Copy operator
Vector<T> &operator=(const Vector<T> &other) { Vector<T> &operator=(const Vector<T> &other) {
if (other.size() != 0) { m_ = other.m_;
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this; return *this;
} }
// Move ctor // Move ctor
Vector(Vector<T> &&other) { Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
}
}
// CPU data access method. Mutable. // CPU data access method. Mutable.
T &operator[](size_t i) { T &operator[](size_t i) { return (*m_.MutableData())[i]; }
MutableCPU();
return const_cast<T *>(cpu_vec_.data<T>())[i];
}
// CPU data access method. Immutable. // CPU data access method. Immutable.
const T &operator[](size_t i) const { const T &operator[](size_t i) const { return m_.Data()[i]; }
ImmutableCPU();
return cpu_vec_.data<T>()[i];
}
// std::vector iterator methods. Based on CPU data access method // std::vector iterator methods. Based on CPU data access method
size_t size() const { return size_; } size_t size() const { return m_.Data().size(); }
T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } iterator begin() { return m_.MutableData()->begin(); }
T *end() { iterator end() { return m_.MutableData()->end(); }
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
T &front() { return *begin(); } T &front() { return m_.MutableData()->front(); }
T &back() { T &back() { return m_.MutableData()->back(); }
auto it = end();
--it;
return *it;
}
const T *begin() const { const_iterator begin() const { return m_.Data().begin(); }
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
}
const T *end() const { const_iterator end() const { return m_.Data().end(); }
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
const T *cbegin() const { return begin(); } const_iterator cbegin() const { return begin(); }
const T *cend() const { return end(); } const_iterator cend() const { return end(); }
const T &back() const { const T &back() const { return m_.Data().back(); }
auto it = end();
--it;
return *it;
}
T *data() { return begin(); } T *data() { return m_.MutableData()->data(); }
const T *data() const { return begin(); } const T *data() const { return m_.Data().data(); }
const T &front() const { return *begin(); } const T &front() const { return m_.Data().front(); }
// end of std::vector iterator methods // end of std::vector iterator methods
// assign this from iterator. // assign this from iterator.
// NOTE: the iterator must support `end-begin` // NOTE: the iterator must support `end-begin`
template <typename Iter> template <typename Iter>
void assign(Iter begin, Iter end) { void assign(Iter begin, Iter end) {
InitByIter(end - begin, begin, end); m_.MutableData()->assign(begin, end);
} }
// push_back. If the previous capacity is not enough, the memory will // push_back. If the previous capacity is not enough, the memory will
// double. // double.
void push_back(T elem) { void push_back(T elem) { m_.MutableData()->push_back(elem); }
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
}
// extend a vector by iterator. // extend a vector by iterator.
// NOTE: the iterator must support end-begin // NOTE: the iterator must support end-begin
template <typename It> template <typename It>
void Extend(It begin, It end) { void Extend(It begin, It end) {
size_t pre_size = size_; m_.MutableData()->Extend(begin, end);
resize(pre_size + (end - begin));
T *ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
} }
// resize the vector // resize the vector
void resize(size_t size) { void resize(size_t size) {
if (size + 1 <= capacity()) { if (m_.Data().size() != size) {
size_ = size; m_.MutableData()->resize(size);
} else {
MutableCPU();
Tensor cpu_tensor;
platform::Place cpu = platform::CPUPlace();
T *ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T *old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
} }
} }
// get cuda ptr. immutable // get cuda ptr. immutable
const T *CUDAData(platform::Place place) const { const T *CUDAData(platform::Place place) const {
PADDLE_ENFORCE(platform::is_gpu_place(place), {
"CUDA Data must on CUDA place"); auto &mtx = m_.Data().Mutex();
ImmutableCUDA(place); std::lock_guard<std::mutex> guard(mtx);
return cuda_vec_.data<T>(); auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAData(place);
} }
// get cuda ptr. mutable // get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) { T *CUDAMutableData(platform::Place place) {
const T *ptr = CUDAData(place); {
flag_ = kDirty | kDataInCUDA; auto &mtx = m_.Data().Mutex();
return const_cast<T *>(ptr); std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAMutableData(place);
} }
// clear // clear
void clear() { void clear() { m_.MutableData()->clear(); }
size_ = 0;
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { size_t capacity() const { return m_.Data().capacity(); }
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
// reserve data // reserve data
void reserve(size_t size) { void reserve(size_t size) { m_.Data().reserve(size); }
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// the unify method to access CPU or CUDA data. immutable. // the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const { const T *Data(platform::Place place) const {
...@@ -248,12 +481,7 @@ class Vector { ...@@ -248,12 +481,7 @@ class Vector {
} }
// implicit cast operator. Vector can be cast to std::vector implicitly. // implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { operator std::vector<T>() const { return m_.Data(); }
std::vector<T> result;
result.resize(size());
std::copy(begin(), end(), result.begin());
return result;
}
bool operator==(const Vector<T> &other) const { bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false; if (size() != other.size()) return false;
...@@ -267,118 +495,11 @@ class Vector { ...@@ -267,118 +495,11 @@ class Vector {
return true; return true;
} }
private: const void *Handle() const { return &m_.Data(); }
void InitEmpty() {
size_ = 0;
flag_ = kDataInCPU;
}
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
platform::Place cpu = platform::CPUPlace();
T *ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
}
flag_ = kDataInCPU | kDirty;
size_ = size;
}
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
WaitPlace(cuda_vec_.place());
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) {
framework::Tensor tmp;
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) {
framework::Tensor tmp;
WaitPlace(cuda_vec_.place());
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void ImmutableCPU() const {
if (IsDirty() &&
!IsInCPU()) { // If data has been changed in CUDA, or CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
SetFlag(kDataInCPU);
}
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
bool IsDirty() const { return flag_ & kDirty; } private:
// Vector is an COW object.
bool IsInCUDA() const { return flag_ & kDataInCUDA; } mutable details::COWPtr<VectorData> m_;
bool IsInCPU() const { return flag_ & kDataInCPU; }
static void WaitPlace(const platform::Place place) {
if (platform::is_gpu_place(place)) {
platform::DeviceContextPool::Instance()
.Get(boost::get<platform::CUDAPlace>(place))
->Wait();
}
}
static T &EmptyDummy() {
static T dummy = T();
return dummy;
}
mutable int flag_;
mutable Tensor cpu_vec_;
mutable Tensor cuda_vec_;
size_t size_;
}; };
#else // PADDLE_WITH_CUDA #else // PADDLE_WITH_CUDA
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
// These code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::CHANNEL) {
var->GetMutable<ChannelHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
var_type);
}
}
void NaiveExecutor::Prepare(Scope *parent_scope,
const ProgramDesc &program_desc, int block_id,
bool with_feed_fetch_ops) {
if (!parent_scope) {
scope_ = new framework::Scope;
} else {
scope_ = &parent_scope->NewScope();
}
CreateVariables(program_desc, scope_, block_id);
CreateOps(program_desc, block_id, with_feed_fetch_ops);
}
void NaiveExecutor::Run() {
for (auto &op : ops_) {
VLOG(4) << "run " << op->Type();
op->Run(*scope_, place_);
}
}
void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
int block_id) {
PADDLE_ENFORCE(scope);
auto &global_block = desc.Block(block_id);
const Scope *ancestor_scope = scope;
while (ancestor_scope->parent()) {
ancestor_scope = ancestor_scope->parent();
}
if (ancestor_scope != scope) {
for (auto &var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
// Create persistable vars in ancestor scope.
if (var->Persistable()) {
auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else { // Create temporary variables in local scope.
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto &var : global_block.AllVars()) {
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
bool with_feed_fetch_ops) {
for (const auto &op_desc : desc.Block(block_id).AllOps()) {
if (!with_feed_fetch_ops &&
(op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s",
op_desc->Input("X")[0], op_desc->Type(),
op_desc->Output("Out")[0]);
continue;
}
ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
}
}
LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
PADDLE_ENFORCE(scope_, "Need to init scope first");
auto *var = scope_->FindVar(name);
PADDLE_ENFORCE(var, "No variable [%s] in the scope");
auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
return tensor;
}
void NaiveExecutor::CleanFeedFetchOps() {
std::vector<std::unique_ptr<OperatorBase>> ops;
for (auto &op : ops_) {
if (op->Type() != "feed" && op->Type() != "fetch") {
ops.emplace_back(std::move(op));
}
}
ops_.swap(ops);
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
/*
* Simple, intuitive and effective. Only single thread is supported, and
* currently designed for inference.
*/
class NaiveExecutor {
public:
explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
// Create child scope.
// Create variables.
// @with_feed_fetch_ops: whether to work with the feed and fetch operators.
void Prepare(Scope* parent_scope, const ProgramDesc& program_desc,
int block_id, bool with_feed_fetch_ops);
// Run all the operators.
void Run();
// Get an tensor to operating directly, without the need for feed_ops.
LoDTensor* FindTensor(const std::string& name);
Scope* scope() { return scope_; }
void CleanFeedFetchOps();
protected:
void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
void CreateOps(const ProgramDesc& desc, int block_id,
bool with_feed_fetch_ops);
private:
const platform::Place place_;
// Catch the required resource to avoid recreate.
std::vector<std::unique_ptr<OperatorBase>> ops_;
Scope* scope_;
};
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/naive_executor.h"
#include <gtest/gtest.h>
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
TEST(NaiveExecutor, Basic) {
ProgramDesc program;
auto* main_block = program.MutableBlock(0);
auto* a = main_block->Var("a"); // input
auto* b = main_block->Var("b"); // input
auto* c = main_block->Var("c"); // input
a->SetType(proto::VarType::LOD_TENSOR);
b->SetType(proto::VarType::LOD_TENSOR);
c->SetType(proto::VarType::LOD_TENSOR);
auto* add = main_block->AppendOp();
add->SetType("elementwise_add");
add->SetInput("X", {"a"});
add->SetInput("Y", {"b"});
add->SetOutput("Out", {"c"});
auto place = platform::CPUPlace();
NaiveExecutor exe(place);
exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
auto* a_tensor = exe.FindTensor("a");
auto* b_tensor = exe.FindTensor("b");
auto* c_tensor = exe.FindTensor("c");
a_tensor->Resize({1, 4});
b_tensor->Resize({1, 4});
c_tensor->Resize({1, 4});
b_tensor->mutable_data<float>(place);
a_tensor->mutable_data<float>(place);
float a_arr[] = {0, 1, 2, 3};
float b_arr[] = {0.0, .1, .2, .3};
std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));
exe.Run();
auto* c_data = c_tensor->mutable_data<float>(place);
for (int i = 0; i < 4; i++) {
EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
}
}
} // namespace framework
} // namespace paddle
USE_OP(elementwise_add);
...@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
#endif #endif
} }
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place)); if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
}
RunImpl(scope, place); RunImpl(scope, place);
if (VLOG_IS_ON(3)) { if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope); VLOG(3) << place << " " << DebugStringEx(&scope);
} }
......
...@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and ...@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -35,80 +33,6 @@ limitations under the License. */ ...@@ -35,80 +33,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes, const bool use_cuda,
#ifdef PADDLE_WITH_CUDA
const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
#else
const BuildStrategy &strategy) {
#endif
// Convert the program to graph.
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
// Apply op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass =
ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
}
// Convert graph to run on multi-devices.
auto multi_devices_pass =
ir::PassRegistry::Instance().Get("multi_devices_pass");
multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
&places);
multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
&loss_var_name);
multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
"params", &param_names);
multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
graph = multi_devices_pass->Apply(std::move(graph));
// Apply a graph print pass to record a graph with device info.
if (!strategy.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass =
ir::PassRegistry::Instance().Get("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy.debug_graphviz_path_);
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
graph = multi_devices_print_pass->Apply(std::move(graph));
}
// Verify that the graph is correct for multi-device executor.
auto multi_devices_check_pass =
ir::PassRegistry::Instance().Get("multi_devices_check_pass");
graph = multi_devices_check_pass->Apply(std::move(graph));
return graph;
}
class ParallelExecutorPrivate { class ParallelExecutorPrivate {
public: public:
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places) explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
...@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor( ...@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor(
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass( std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params, main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, build_strategy, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
member_->nccl_ctxs_.get());
auto max_memory_size = GetEagerDeletionThreshold(); auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) { if (max_memory_size >= 0) {
...@@ -228,11 +151,17 @@ ParallelExecutor::ParallelExecutor( ...@@ -228,11 +151,17 @@ ParallelExecutor::ParallelExecutor(
} }
} }
#else #else
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass( std::unique_ptr<ir::Graph> graph =
main_program, member_->places_, loss_var_name, params, build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->use_cuda_, build_strategy); params, member_->local_scopes_, member_->use_cuda_);
#endif #endif
// If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) {
PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
"The number of graph should be only one");
}
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, std::move(graph))); exec_strategy, member_->local_scopes_, places, std::move(graph)));
...@@ -373,12 +302,6 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -373,12 +302,6 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
USE_PASS(reference_count_pass); USE_PASS(reference_count_pass);
#endif #endif
...@@ -14,14 +14,14 @@ limitations under the License. */ ...@@ -14,14 +14,14 @@ limitations under the License. */
#pragma once #pragma once
#include <paddle/fluid/framework/details/build_strategy.h>
#include <atomic> #include <atomic>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
......
...@@ -20,6 +20,13 @@ limitations under the License. */ ...@@ -20,6 +20,13 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
// The mutex is not needed by training and inference, only for distribution.
#if PADDLE_WITH_DISTRIBUTE
#define WITH_LOCK 1
#else
#define WITH_LOCK 0
#endif
DEFINE_bool(benchmark, false, DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, " "Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs." "and add some memory usage logs."
...@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() { ...@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() {
Scope::~Scope() { DropKids(); } Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
kids_.push_back(new Scope(this)); kids_.push_back(new Scope(this));
return *kids_.back(); return *kids_.back();
} }
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
return VarInternal(name); return VarInternal(name);
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
...@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) { ...@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) {
} }
Variable* Scope::FindVar(const std::string& name) const { Variable* Scope::FindVar(const std::string& name) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
return FindVarInternal(name); return FindVarInternal(name);
} }
const Scope* Scope::FindScope(const Variable* var) const { const Scope* Scope::FindScope(const Variable* var) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
return FindScopeInternal(var); return FindScopeInternal(var);
} }
void Scope::DropKids() { void Scope::DropKids() {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
for (Scope* s : kids_) delete s; for (Scope* s : kids_) delete s;
kids_.clear(); kids_.clear();
} }
bool Scope::HasKid(const Scope* scope) const { bool Scope::HasKid(const Scope* scope) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end(); return it != this->kids_.end();
} }
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
std::vector<std::string> known_vars; std::vector<std::string> known_vars;
known_vars.reserve(this->vars_.size()); known_vars.reserve(this->vars_.size());
for (auto& p : vars_) { for (auto& p : vars_) {
...@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const { ...@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
} }
void Scope::DeleteScope(Scope* scope) const { void Scope::DeleteScope(Scope* scope) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
...@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const { ...@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const {
} }
void Scope::EraseVars(const std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
...@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) { ...@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const { const std::string& new_name) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
} }
std::string Scope::Rename(const std::string& origin_name) const { std::string Scope::Rename(const std::string& origin_name) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
#endif
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
return new_name; return new_name;
......
...@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test { ...@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
selected_rows_.reset(new SelectedRows(rows, height)); selected_rows_.reset(new SelectedRows(rows, height));
Tensor* value = selected_rows_->mutable_value(); Tensor* value = selected_rows_->mutable_value();
value->mutable_data<float>( auto* data = value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_); make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
for (int64_t i = 0; i < value->numel(); ++i) {
data[i] = static_cast<float>(i);
}
} }
protected: protected:
...@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ...@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
auto* dst_data = dst_tensor.value().data<float>();
for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
ASSERT_EQ(dst_data[i], static_cast<float>(i));
}
} }
TEST(SelectedRows, SparseTable) { TEST(SelectedRows, SparseTable) {
......
...@@ -53,7 +53,7 @@ if(NOT APPLE) ...@@ -53,7 +53,7 @@ if(NOT APPLE)
endif() endif()
if(WITH_TESTING) if(WITH_TESTING)
# tests/book depends the models that generated by python/paddle/fluid/tests/book # tests/book depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book) add_subdirectory(tests/book)
if(WITH_INFERENCE_API_TEST) if(WITH_INFERENCE_API_TEST)
add_subdirectory(tests/api) add_subdirectory(tests/api)
......
cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
set(analysis_deps set(analysis_deps
framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
analyzer.cc analyzer.cc
......
...@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) { ...@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
TEST(Analyzer, analysis_with_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true; FLAGS_IA_enable_tensorrt_subgraph_engine = true;
Argument argument; Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(0));
argument.Set<int>("max_batch_size", new int(3));
argument.Set<int>("workspace_size", new int(1 << 20));
argument.Set<std::string>("precision_mode", new std::string("FP32"));
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
Analyzer analyser; Analyzer analyser;
analyser.Run(&argument); analyser.Run(&argument);
} }
void TestWord2vecPrediction(const std::string &model_path) { void TestWord2vecPrediction(const std::string& model_path) {
NativeConfig config; NativeConfig config;
config.model_dir = model_path; config.model_dir = model_path;
config.use_gpu = false; config.use_gpu = false;
...@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) { ...@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << "data: " LOG(INFO) << "data: "
<< static_cast<float *>(outputs.front().data.data())[i]; << static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i], PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]); result[i]);
} }
} }
......
...@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { ...@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
} }
} }
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, void CreateTrtEngineOp(Node *node, Argument *argument,
framework::proto::BlockDesc *block) { framework::proto::BlockDesc *block) {
PADDLE_ENFORCE(argument->main_dfg.get());
const DataFlowGraph &graph = *(argument->main_dfg);
static int counter{0}; static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock()); PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc; framework::OpDesc desc;
...@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, ...@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
// Set attrs // Set attrs
SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
SetAttr(desc.Proto(), "output_name_mapping", output_mapping); SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
...@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { ...@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
*block_desc.Proto()->mutable_vars() = *block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars(); argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto()); CreateTrtEngineOp(node, argument_, block_desc.Proto());
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops(); auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
......
...@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } ...@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
void SubGraphFuse::ReplaceNodesWithSubGraphs() { void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) { for (auto &subgraph : subgraphs) {
if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
continue;
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end()); std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block // replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/node.h"
...@@ -63,8 +64,11 @@ class SubGraphFuse { ...@@ -63,8 +64,11 @@ class SubGraphFuse {
public: public:
using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
: graph_(graph), node_inside_subgraph_teller_(teller) {} Argument *argument)
: graph_(graph),
node_inside_subgraph_teller_(teller),
argument_(argument) {}
// The main method which run all the logic. // The main method which run all the logic.
void operator()(); void operator()();
...@@ -76,6 +80,7 @@ class SubGraphFuse { ...@@ -76,6 +80,7 @@ class SubGraphFuse {
private: private:
DataFlowGraph *graph_; DataFlowGraph *graph_;
NodeInsideSubgraphTeller node_inside_subgraph_teller_; NodeInsideSubgraphTeller node_inside_subgraph_teller_;
Argument *argument_;
}; };
} // namespace analysis } // namespace analysis
......
...@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) { ...@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) {
TEST(SubGraphSplitter, Fuse) { TEST(SubGraphSplitter, Fuse) {
auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc); auto dfg = ProgramDescToDFG(desc);
Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(3));
size_t count0 = dfg.nodes.size(); size_t count0 = dfg.nodes.size();
SubGraphFuse fuse(&dfg, teller); SubGraphFuse fuse(&dfg, teller, &argument);
fuse(); fuse();
int count1 = 0; int count1 = 0;
......
...@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( ...@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
: node_inside_subgraph_teller_(teller) {} : node_inside_subgraph_teller_(teller) {}
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_)(); SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
VLOG(4) << "debug info " VLOG(4) << "debug info "
<< graph->HumanReadableInfo(false /*show_values*/, << graph->HumanReadableInfo(false /*show_values*/,
true /*show_functions*/); true /*show_functions*/);
......
...@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
bool Initialize(Argument* argument) override { return true; } bool Initialize(Argument* argument) override {
argument_ = argument;
return true;
}
// This class get a sub-graph as input and determine whether to transform this // This class get a sub-graph as input and determine whether to transform this
// sub-graph into TensorRT. // sub-graph into TensorRT.
...@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
private: private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_; NodeInsideSubgraphTeller node_inside_subgraph_teller_;
Argument* argument_;
}; };
} // namespace analysis } // namespace analysis
......
...@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) { ...@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) {
}; };
Argument argument(FLAGS_inference_model_dir); Argument argument(FLAGS_inference_model_dir);
argument.Set<int>("minimum_subgraph_size", new int(0));
argument.Set<int>("max_batch_size", new int(3));
argument.Set<int>("workspace_size", new int(1 << 20));
argument.Set<std::string>("precision_mode", new std::string("FP32"));
DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
......
...@@ -18,10 +18,10 @@ if(APPLE) ...@@ -18,10 +18,10 @@ if(APPLE)
endif(APPLE) endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
endif() endif()
function(inference_api_test TARGET_NAME) function(inference_api_test TARGET_NAME)
...@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME) ...@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
endif(WITH_TESTING) endif(WITH_TESTING)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
...@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api ...@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
inference_api_test(test_api_impl SRC api_impl_tester.cc inference_api_test(test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification) ARGS test_word2vec test_image_classification)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book)
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif() endif()
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope)
function(anakin_target target_name) function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction() endfunction()
......
...@@ -16,11 +16,15 @@ ...@@ -16,11 +16,15 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -28,8 +32,11 @@ DECLARE_bool(profile); ...@@ -28,8 +32,11 @@ DECLARE_bool(profile);
namespace paddle { namespace paddle {
using contrib::AnalysisConfig;
bool AnalysisPredictor::Init( bool AnalysisPredictor::Init(
const std::shared_ptr<framework::Scope>& parent_scope) { const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program) {
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
#if !defined(_WIN32) #if !defined(_WIN32)
if (FLAGS_profile) { if (FLAGS_profile) {
...@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init( ...@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init(
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently"; LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
"is turned false.";
config_.enable_ir_optim = false; config_.enable_ir_optim = false;
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
...@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init( ...@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init(
scope_.reset(new paddle::framework::Scope()); scope_.reset(new paddle::framework::Scope());
} }
executor_.reset(new paddle::framework::Executor(place_)); executor_.reset(new paddle::framework::NaiveExecutor(place_));
// Initialize the inference program if (!program) {
if (!config_.model_dir.empty()) { if (!LoadProgramDesc()) return false;
// Parameters are saved in separate files sited in OptimizeInferenceProgram();
// the specified `dirname`.
inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else { } else {
LOG(ERROR) << "fail to load inference model from " << config_.model_dir; inference_program_ = program;
}
executor_->Prepare(scope_.get(), *inference_program_, 0,
config_.use_feed_fetch_ops);
// Get the feed_target_names and fetch_target_names
PrepareFeedFetch();
return true;
}
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
VLOG(3) << "Predictor::predict";
inference::Timer timer;
timer.tic();
// set feed variable
std::vector<framework::LoDTensor> feeds;
framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
return false; return false;
} }
// Run the inference program
// if share variables, we need not create variables
executor_->Run();
OptimizeInferenceProgram(); // get fetch variable
if (config_._use_mkldnn) { if (!GetFetch(output_data, scope)) {
executor_->EnableMKLDNN(*inference_program_); LOG(ERROR) << "fail to get fetches";
return false;
} }
ctx_ = executor_->Prepare(*inference_program_, 0); VLOG(3) << "predict cost: " << timer.toc() << "ms";
return true;
}
VLOG(5) << "to create variables"; bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
PADDLE_ENFORCE(scope_.get()); framework::Scope *scope) {
executor_->CreateVariables(*inference_program_, VLOG(3) << "Predictor::set_feed";
sub_scope_ ? sub_scope_ : scope_.get(), 0); if (inputs.size() != feeds_.size()) {
// Get the feed_target_names and fetch_target_names LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
PrepareFeedFetch(); << inputs.size();
return false;
}
// Cache the inputs memory for better concurrency performance.
feed_tensors_.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
auto &input = feed_tensors_[i];
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
for (auto &level : inputs[i].lod) {
lod.emplace_back(level);
}
input.set_lod(lod);
int idx = -1;
if (config_.specify_input_name) {
idx = feed_names_[inputs[i].name];
} else {
idx = boost::get<int>(feeds_[i]->GetAttr("col"));
}
framework::SetFeedVariable(scope, input, "feed", idx);
}
return true;
}
template <typename T>
void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
PaddleTensor *output) {
// set shape.
auto shape = framework::vectorize(fetch.dims());
output->shape.assign(shape.begin(), shape.end());
// set data.
const T *data = fetch.data<T>();
int num_elems = inference::VecReduceToInt(shape);
output->data.Resize(num_elems * sizeof(T));
// The fetched tensor output by fetch op, should always in CPU memory, so just
// copy.
memcpy(output->data.data(), data, num_elems * sizeof(T));
// set lod
output->lod.clear();
for (auto &level : fetch.lod()) {
output->lod.emplace_back(level.begin(), level.end());
}
}
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type();
auto output = &(outputs->at(i));
if (type == typeid(float)) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;
} else if (type == typeid(int64_t)) {
GetFetchOne<int64_t>(fetch, output);
output->dtype = PaddleDType::INT64;
} else {
LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
}
return true; return true;
} }
...@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
new std::string(config_.prog_file)); new std::string(config_.prog_file));
argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); argument_.fluid_model_param_path.reset(new std::string(config_.param_file));
} }
argument_.origin_program_desc.reset( argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto())); new ProgramDesc(*inference_program_->Proto()));
PADDLE_ENFORCE( PADDLE_ENFORCE(
...@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
template <> template <>
std::unique_ptr<PaddlePredictor> std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>( AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
const contrib::AnalysisConfig& config) {
VLOG(3) << "create AnalysisConfig"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memeroy // 1. GPU memeroy
...@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>( ...@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
} }
std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config)); std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
if (!dynamic_cast<AnalysisPredictor*>(predictor.get())->Init(nullptr)) { if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
return nullptr; return nullptr;
} }
return predictor; return predictor;
} }
void AnalysisPredictor::PrepareFeedFetch() {
for (auto *op : inference_program_->Block(0).AllOps()) {
if (op->Type() == "feed") {
int idx = boost::get<int>(op->GetAttr("col"));
if (feeds_.size() <= static_cast<size_t>(idx)) {
feeds_.resize(idx + 1);
}
feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx;
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetchs_.size() <= static_cast<size_t>(idx)) {
fetchs_.resize(idx + 1);
}
fetchs_[idx] = op;
}
}
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true;
res->SetName(name);
return res;
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false;
res->SetName(name);
return res;
}
bool AnalysisPredictor::ZeroCopyRun() {
executor_->Run();
return true;
}
bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program
std::unique_ptr<framework::Executor> tmp_exe(
new framework::Executor(platform::CPUPlace()));
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s'.", config_.model_dir,
config_.param_file);
return false;
}
return true;
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
auto *x = new AnalysisPredictor(config_);
x->Init(scope_, inference_program_);
return std::unique_ptr<PaddlePredictor>(x);
}
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>( std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
const contrib::AnalysisConfig& config) { const contrib::AnalysisConfig &config) {
return CreatePaddlePredictor<contrib::AnalysisConfig, return CreatePaddlePredictor<contrib::AnalysisConfig,
PaddleEngineKind::kAnalysis>(config); PaddleEngineKind::kAnalysis>(config);
} }
......
...@@ -12,42 +12,81 @@ ...@@ -12,42 +12,81 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
using inference::analysis::Argument; using inference::analysis::Argument;
using inference::analysis::Analyzer; using inference::analysis::Analyzer;
using framework::proto::ProgramDesc; using framework::proto::ProgramDesc;
using framework::NaiveExecutor;
using contrib::AnalysisConfig;
/* This predictor is based on the original native predictor with IR and Analysis /* This predictor is based on the original native predictor with IR and Analysis
* support. It will optimize IR and Parameters in the runtime. * support. It will optimize IR and Parameters in the runtime.
* TODO(Superjomn) Replace the Navive predictor? * TODO(Superjomn) Replace the Navive predictor?
*/ */
class AnalysisPredictor : public NativePaddlePredictor { class AnalysisPredictor : public PaddlePredictor {
public: public:
explicit AnalysisPredictor(const contrib::AnalysisConfig& config) explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope); bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
bool Run(const std::vector<PaddleTensor>& inputs, bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor> *output_data,
int batch_size = -1) override { int batch_size = -1) override;
return NativePaddlePredictor::Run(inputs, output_data, batch_size);
} std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string &name) override;
bool ZeroCopyRun() override;
void PrepareFeedFetch();
void OptimizeInferenceProgram(); void OptimizeInferenceProgram();
Argument& analysis_argument() { return argument_; } Argument &analysis_argument() { return argument_; }
std::unique_ptr<PaddlePredictor> Clone() override;
framework::Scope *scope() { return executor_->scope(); }
framework::ProgramDesc &program() { return *inference_program_; }
protected:
bool LoadProgramDesc();
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
framework::Scope *scope);
bool GetFetch(std::vector<PaddleTensor> *output_data,
framework::Scope *scope);
template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data);
private: private:
contrib::AnalysisConfig config_; contrib::AnalysisConfig config_;
Argument argument_; Argument argument_;
std::unique_ptr<NaiveExecutor> executor_;
platform::Place place_;
std::shared_ptr<framework::Scope> scope_;
framework::Scope *sub_scope_{nullptr};
std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
}; };
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(dirname, "", "dirname to tests.");
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
TEST(AnalysisPredictor, ZeroCopy) {
AnalysisConfig config;
config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
config.use_feed_fetch_ops = false;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
auto w0 = predictor->GetInputTensor("firstw");
auto w1 = predictor->GetInputTensor("secondw");
auto w2 = predictor->GetInputTensor("thirdw");
auto w3 = predictor->GetInputTensor("forthw");
w0->Reshape({4, 1});
w1->Reshape({4, 1});
w2->Reshape({4, 1});
w3->Reshape({4, 1});
auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
for (int i = 0; i < 4; i++) {
w0_data[i] = i;
w1_data[i] = i;
w2_data[i] = i;
w3_data[i] = i;
}
predictor->ZeroCopyRun();
auto out = predictor->GetOutputTensor("fc_1.tmp_2");
PaddlePlace place;
int size = 0;
auto* out_data = out->data<float>(&place, &size);
LOG(INFO) << "output size: " << size / sizeof(float);
LOG(INFO) << "output_data: " << out_data;
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); //
you may not use this file except in compliance with the License. // Licensed under the Apache License, Version 2.0 (the "License");
You may obtain a copy of the License at // you may not use this file except in compliance with the License.
http://www.apache.org/licenses/LICENSE-2.0 // You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software //
distributed under the License is distributed on an "AS IS" BASIS, // http://www.apache.org/licenses/LICENSE-2.0
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
See the License for the specific language governing permissions and // Unless required by applicable law or agreed to in writing, software
limitations under the License. */ // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle_inference_api.h"
namespace paddle { namespace paddle {
...@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) { ...@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
} }
} }
PaddleBuf::PaddleBuf(PaddleBuf&& other) PaddleBuf::PaddleBuf(PaddleBuf &&other)
: data_(other.data_), : data_(other.data_),
length_(other.length_), length_(other.length_),
memory_owned_(other.memory_owned_) { memory_owned_(other.memory_owned_) {
...@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) ...@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
other.length_ = 0; other.length_ = 0;
} }
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
if (!other.memory_owned_) { if (!other.memory_owned_) {
data_ = other.data_; data_ = other.data_;
length_ = other.length_; length_ = other.length_;
...@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { ...@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
return *this; return *this;
} }
PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
// only the buffer with external memory can be copied // only the buffer with external memory can be copied
data_ = other.data_; data_ = other.data_;
length_ = other.length_; length_ = other.length_;
...@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) { ...@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
} }
} }
void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Reset(void *data, size_t length) {
Free(); Free();
memory_owned_ = false; memory_owned_ = false;
data_ = data; data_ = data;
...@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) { ...@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() { void PaddleBuf::Free() {
if (memory_owned_ && data_) { if (memory_owned_ && data_) {
PADDLE_ENFORCE_GT(length_, 0); PADDLE_ENFORCE_GT(length_, 0);
free(static_cast<char*>(data_)); free(static_cast<char *>(data_));
data_ = nullptr; data_ = nullptr;
length_ = 0; length_ = 0;
} }
......
...@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
VLOG(4) << "Run prepared context"; VLOG(4) << "Run prepared context";
executor_->RunPreparedContext(ctx_.get(), scope, executor_->RunPreparedContext(ctx_.get(), scope,
false, /* don't create local scope each time*/ false, /* don't create local scope each time*/
false /* don't create variable eatch time */); false /* don't create variable each time */);
VLOG(4) << "Finish prepared context"; VLOG(4) << "Finish prepared context";
// get fetch variable // get fetch variable
if (!GetFetch(output_data, scope)) { if (!GetFetch(output_data, scope)) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
...@@ -30,6 +30,8 @@ ...@@ -30,6 +30,8 @@
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor { ...@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {
~NativePaddlePredictor() override; ~NativePaddlePredictor() override;
framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); }
protected: protected:
bool SetFeed(const std::vector<PaddleTensor> &input_datas, bool SetFeed(const std::vector<PaddleTensor> &input_datas,
framework::Scope *scope); framework::Scope *scope);
......
...@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { ...@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig GetConfig() { NativeConfig GetConfig() {
NativeConfig config; NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model"; config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir; LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) { ...@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model"; FLAGS_dirname + "/image_classification_resnet.inference.model";
const bool is_combined = false; const bool is_combined = false;
std::vector<std::vector<int64_t>> feed_target_shapes = std::vector<std::vector<int64_t>> feed_target_shapes =
...@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model"; FLAGS_dirname + "/image_classification_resnet.inference.model";
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config); auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs); std::vector<framework::LoDTensor> jobs(num_jobs);
......
...@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) { bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true; FLAGS_IA_enable_tensorrt_subgraph_engine = true;
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
FLAGS_tensorrt_workspace_size = config_.workspace_size;
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
} else { } else {
...@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
void OptimizeInferenceProgram() { void OptimizeInferenceProgram() {
// Analyze inference_program // Analyze inference_program
Argument argument; Argument argument;
argument.Set<int>("minimum_subgraph_size",
new int(config_.minimum_subgraph_size));
argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
argument.Set<int>("workspace_size", new int(config_.workspace_size));
argument.Set<std::string>("precision_mode",
new std::string(config_.precision_mode));
if (!config_.model_dir.empty()) { if (!config_.model_dir.empty()) {
argument.fluid_model_dir.reset(new std::string(config_.model_dir)); argument.fluid_model_dir.reset(new std::string(config_.model_dir));
} else { } else {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
PADDLE_ENFORCE(!name_.empty(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(input_or_output_,
"Can't reshape the output tensor, it is readonly");
PADDLE_ENFORCE(scope_);
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
auto *tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(shape));
}
template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
switch (static_cast<int>(place)) {
case static_cast<int>(PaddlePlace::kCPU): {
return tensor->mutable_data<T>(platform::CPUPlace());
}
case static_cast<int>(PaddlePlace::kGPU): {
return tensor->mutable_data<T>(platform::CUDAPlace());
}
default:
PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
break;
}
return nullptr;
}
template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
auto *res = tensor->data<T>();
if (platform::is_cpu_place(tensor->place())) {
*place = PaddlePlace::kCPU;
} else if (platform::is_gpu_place(tensor->place())) {
*place = PaddlePlace::kGPU;
} else {
*place = PaddlePlace::kUNK;
}
*size = tensor->numel();
return res;
}
template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const {
PADDLE_ENFORCE(!name_.empty(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(scope_);
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
auto *tensor = var->GetMutable<framework::LoDTensor>();
return tensor;
}
std::vector<int64_t> ZeroCopyTensor::shape() {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
return framework::vectorize(tensor->dims());
}
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
framework::LoD lod;
for (auto &level : x) {
lod.emplace_back(level);
}
tensor->set_lod(lod);
}
std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
std::vector<std::vector<size_t>> res;
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
for (auto &level : tensor->lod()) {
res.emplace_back(level);
}
return res;
}
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}
template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
return nullptr;
}
template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
return nullptr;
}
template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; }
std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
return std::vector<std::vector<size_t>>();
}
} // namespace paddle
...@@ -21,8 +21,10 @@ ...@@ -21,8 +21,10 @@
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor, ...@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
} }
} }
template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
const std::vector<std::vector<T>> &data) {
int size{0};
auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
int c = 0;
for (const auto &f : data) {
for (T v : f) {
ptr[c++] = v;
}
}
return size;
}
static std::string DescribeTensor(const PaddleTensor &tensor) { static std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os; std::stringstream os;
os << "Tensor [" << tensor.name << "]\n"; os << "Tensor [" << tensor.name << "]\n";
...@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, ...@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
} }
} }
template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
std::stringstream ss;
ss << "\n---- tensor ---" << '\n';
ss << "lod: [";
for (const auto &level : tensor.lod()) {
ss << "[ ";
for (auto i : level) {
ss << i << ", ";
}
ss << "]";
}
ss << "]\n";
ss << "shape: [";
int size = 1;
for (int i = 0; i < tensor.dims().size(); i++) {
int dim = tensor.dims()[i];
ss << dim << ", ";
size *= dim;
}
ss << "]\n";
ss << "data: ";
for (int i = 0; i < std::min(20, size); i++) {
ss << tensor.data<T>()[i] << " ";
}
ss << "\n";
return ss.str();
}
static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
if (a.size() != b.size()) {
LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
b.size());
return false;
}
for (size_t i = 0; i < a.size(); i++) {
auto &al = a[i];
auto &bl = b[i];
if (al.size() != bl.size()) {
LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
bl.size());
return false;
}
}
return true;
}
static bool CompareShape(const std::vector<int64_t> &a,
const std::vector<int64_t> &b) {
if (a.size() != b.size()) {
LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
b.size());
return false;
}
for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) {
LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i,
a[i], b[i]);
return false;
}
}
return true;
}
static bool CompareTensorData(const framework::LoDTensor &a,
const framework::LoDTensor &b) {
auto a_shape = framework::vectorize(a.dims());
auto b_shape = framework::vectorize(b.dims());
size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1,
[](int a, int b) { return a * b; });
size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1,
[](int a, int b) { return a * b; });
if (a_size != b_size) {
LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
a_size, b_size);
}
for (size_t i = 0; i < a_size; i++) {
if (a.type() == typeid(float)) {
const auto *a_data = a.data<float>();
const auto *b_data = b.data<float>();
if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
LOG(ERROR) << string::Sprintf(
"tensor data %d-th element not match, %f != %f", i, a_data[i],
b_data[i]);
return false;
}
} else if (a.type() == typeid(int64_t)) {
const auto *a_data = a.data<int64_t>();
const auto *b_data = b.data<int64_t>();
if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
LOG(ERROR) << string::Sprintf(
"tensor data %d-th element not match, %f != %f", i, a_data[i],
b_data[i]);
return false;
}
}
}
return true;
}
static bool CompareTensor(const framework::LoDTensor &a,
const framework::LoDTensor &b) {
if (!CompareLoD(a.lod(), b.lod())) {
return false;
}
if (!CompareShape(framework::vectorize(a.dims()),
framework::vectorize(b.dims()))) {
return false;
}
if (!CompareTensorData(a, b)) {
return false;
}
return true;
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -101,6 +101,40 @@ struct PaddleTensor { ...@@ -101,6 +101,40 @@ struct PaddleTensor {
std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor
}; };
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
// Tensor without copy, currently only supports AnalysisPredictor.
class ZeroCopyTensor {
public:
void Reshape(const std::vector<int>& shape);
// Get the memory in CPU or GPU with specific data type, should Reshape first
// to tell the data size.
// Once can directly call this data to feed the data.
// This is for write the input tensor.
template <typename T>
T* mutable_data(PaddlePlace place);
// Get the memory directly, will return the place and memory size by pointer.
// This is for reading the output tensor.
template <typename T>
T* data(PaddlePlace* place, int* size);
std::vector<int64_t> shape();
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
protected:
ZeroCopyTensor(void* scope) : scope_{scope} {}
void SetName(const std::string& name) { name_ = name; }
void* FindTensor() const;
private:
std::string name_;
bool input_or_output_;
friend class AnalysisPredictor;
void* scope_{nullptr};
};
/* /*
* A simple Inference API for Paddle. * A simple Inference API for Paddle.
*/ */
...@@ -120,6 +154,19 @@ class PaddlePredictor { ...@@ -120,6 +154,19 @@ class PaddlePredictor {
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0; int batch_size = -1) = 0;
// Zero copy input and output optimization.
// Get the input or output tensors, and operate on their memory directly,
// without copy.
virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string& name) {
return nullptr;
}
virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string& name) {
return nullptr;
}
virtual bool ZeroCopyRun() { return false; }
// Clone a predictor that share the model weights, the Cloned predictor should // Clone a predictor that share the model weights, the Cloned predictor should
// be thread-safe. // be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0; virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
...@@ -194,6 +241,14 @@ struct MixedRTConfig : public NativeConfig { ...@@ -194,6 +241,14 @@ struct MixedRTConfig : public NativeConfig {
// For workspace_size, refer it from here: // For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int workspace_size{1 << 30}; int workspace_size{1 << 30};
// We transform the Ops that can be converted into TRT layer in the model,
// and aggregate these Ops into subgraphs for TRT execution.
// We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value.
int minimum_subgraph_size = 3;
// Reserved configuration
// We just support "FP32" now, "FP16" and "INT8" will be supported.
std::string precision_mode = "FP32";
}; };
// NOTE WIP, not stable yet. // NOTE WIP, not stable yet.
...@@ -204,12 +259,18 @@ struct AnalysisConfig : public NativeConfig { ...@@ -204,12 +259,18 @@ struct AnalysisConfig : public NativeConfig {
kExclude // Specify the disabled passes in `ir_passes`. kExclude // Specify the disabled passes in `ir_passes`.
}; };
// Determine whether to perform graph optimization.
bool enable_ir_optim = true; bool enable_ir_optim = true;
// Manually determine the IR passes to run.
IrPassMode ir_mode{IrPassMode::kExclude}; IrPassMode ir_mode{IrPassMode::kExclude};
// attention lstm fuse works only on some specific models, disable as default. std::vector<std::string> ir_passes;
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
// NOT stable yet.
bool use_feed_fetch_ops{true};
// NOTE this is just for internal development, please not use it. // NOTE this is just for internal development, please not use it. NOT
// stable
// yet.
bool _use_mkldnn{false}; bool _use_mkldnn{false};
}; };
......
...@@ -90,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI ...@@ -90,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
DEPS inference_anakin_api_shared dynload_cuda SERIAL) DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif() endif()
endif() endif()
if(WITH_GPU AND TENSORRT_FOUND)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
cc_test(test_trt_models SRCS trt_models_tester.cc
ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
DEPS paddle_inference_tensorrt_subgraph_engine)
endif()
...@@ -18,6 +18,8 @@ namespace paddle { ...@@ -18,6 +18,8 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<int64_t> data; std::vector<int64_t> data;
std::vector<size_t> lod; std::vector<size_t> lod;
...@@ -78,6 +80,7 @@ struct DataRecord { ...@@ -78,6 +80,7 @@ struct DataRecord {
} }
} }
} }
DataRecord NextBatch() { DataRecord NextBatch() {
DataRecord data; DataRecord data;
data.data = batched_datas[batch_iter]; data.data = batched_datas[batch_iter];
...@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) { ...@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<std::vector<int64_t>> word_data_all, mention_data_all; std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
...@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { ...@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
......
...@@ -12,12 +12,16 @@ ...@@ -12,12 +12,16 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_bool(with_precision_check, true, "turn on test");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using namespace framework; // NOLINT using namespace framework; // NOLINT
using namespace contrib; // NOLINT
struct DataRecord { struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all; std::vector<std::vector<std::vector<float>>> link_step_data_all;
...@@ -29,10 +33,12 @@ struct DataRecord { ...@@ -29,10 +33,12 @@ struct DataRecord {
size_t batch_iter{0}; size_t batch_iter{0};
size_t batch_size{1}; size_t batch_size{1};
DataRecord() = default; DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1) explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) { : batch_size(batch_size) {
Load(path); Load(path);
} }
DataRecord NextBatch() { DataRecord NextBatch() {
DataRecord data; DataRecord data;
size_t batch_end = batch_iter + batch_size; size_t batch_end = batch_iter + batch_size;
...@@ -101,6 +107,7 @@ struct DataRecord { ...@@ -101,6 +107,7 @@ struct DataRecord {
num_samples = num_lines; num_samples = num_lines;
} }
}; };
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) { int batch_size) {
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
...@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
ZeroCopyTensor *cell_init_tensor,
ZeroCopyTensor *data_tensor,
ZeroCopyTensor *hidden_init_tensor,
ZeroCopyTensor *week_tensor,
ZeroCopyTensor *minute_tensor,
DataRecord *data_record, int batch_size) {
auto one_batch = data_record->NextBatch();
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor->Reshape({1, 2});
lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2});
cell_init_tensor->Reshape({batch_size, 15});
cell_init_tensor->SetLoD({one_batch.lod3});
hidden_init_tensor->Reshape({batch_size, 15});
hidden_init_tensor->SetLoD({one_batch.lod3});
data_tensor->Reshape(rnn_link_data_shape);
data_tensor->SetLoD({one_batch.lod1});
week_tensor->Reshape(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor->SetLoD({one_batch.lod3});
minute_tensor->Reshape(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor->SetLoD({one_batch.lod3});
// assign data
float arr0[] = {0, 0};
std::vector<float> zeros(batch_size * 15, 0);
std::copy_n(arr0, 2,
lod_attention_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(arr0, 2, data_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(zeros.begin(), zeros.size(),
cell_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(zeros.begin(), zeros.size(),
hidden_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data);
ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas);
ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas);
}
void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param"; cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false; cfg->use_gpu = false;
...@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) { ...@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
...@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) { ...@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */); TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}
bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope,
const std::vector<std::string> &tensors) {
for (auto &x : tensors) {
auto *a_var = a_scope.FindVar(x);
auto *b_var = b_scope.FindVar(x);
if (a_var && b_var) {
if (a_var->Type() == typeid(framework::LoDTensor) ||
a_var->Type() == typeid(framework::Tensor)) {
LOG(INFO) << "comparing tensor " << x;
auto &a_t = a_var->Get<framework::LoDTensor>();
auto &b_t = b_var->Get<framework::LoDTensor>();
if (!inference::CompareTensor(a_t, b_t)) {
LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x);
}
} else {
LOG(INFO) << "skip no tensor " << x;
}
} else {
LOG(INFO) << "skip tensor " << x;
}
}
return true;
}
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// on the complex RNN1 model.
TEST(Analyzer_rnn1, ZeroCopy) {
AnalysisConfig config;
SetConfig(&config);
config.use_feed_fetch_ops = false;
PaddlePlace place;
int output_size{0};
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
config.use_feed_fetch_ops = true;
auto native_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch.
auto analysis_predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
NEW_TENSOR(week);
NEW_TENSOR(minute);
NEW_TENSOR(hidden_init);
// Prepare data for AnalysisPredictor
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
data_tensor.get(), hidden_init_tensor.get(),
week_tensor.get(), minute_tensor.get(), &data,
FLAGS_batch_size);
// Prepare data for NativePredictor
std::vector<std::vector<PaddleTensor>> native_inputs;
SetInput(&native_inputs);
std::vector<PaddleTensor> native_outputs;
std::vector<PaddleTensor> analysis_outputs;
auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
// Run analysis predictor
int num_ops;
auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
ASSERT_EQ(num_ops,
13); // After graph optimization, only 13 operators exists.
Timer timer;
double total_time{0};
double native_total_time{0};
double analysis_total_time{0.};
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
predictor->ZeroCopyRun();
total_time += timer.toc();
}
auto *output_data = output_tensor->data<float>(&place, &output_size);
ASSERT_GT(output_size, 0); // more than one output!
for (int i = 0; i < FLAGS_repeat; i++) {
// Run native predictor.
timer.tic();
ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
native_total_time += timer.toc();
}
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
ASSERT_TRUE(
analysis_predictor->Run(native_inputs.front(), &analysis_outputs));
analysis_total_time += timer.toc();
}
if (!FLAGS_with_precision_check) {
return;
}
int native_output_size = VecReduceToInt(native_outputs.front().shape);
EXPECT_EQ(native_output_size, output_size);
// Compare tensors between analysis and zerocopy
auto *p0 = static_cast<AnalysisPredictor *>(predictor.get());
auto *p1 = static_cast<AnalysisPredictor *>(analysis_predictor.get());
auto *p2 = static_cast<NativePaddlePredictor *>(native_predictor.get());
std::vector<std::string> tensor_names;
for (auto &var_desc : p0->program().Block(0).AllVars()) {
tensor_names.push_back(var_desc->Name());
}
LOG(INFO) << "Comparing tensors";
ASSERT_TRUE(
CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"}));
ASSERT_TRUE(
CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"}));
LOG(INFO) << "output1 " << inference::LoDTensorSummary<float>(
p0->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
LOG(INFO) << "output2 " << inference::LoDTensorSummary<float>(
p1->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
LOG(INFO) << "output3 " << inference::LoDTensorSummary<float>(
p2->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
for (int i = 0; i < output_size; i++) {
LOG(INFO) << output_data[i] << " "
<< static_cast<float *>(native_outputs.front().data.data())[i]
<< " "
<< static_cast<float *>(analysis_outputs.front().data.data())[i];
EXPECT_NEAR(output_data[i],
static_cast<float *>(native_outputs.front().data.data())[i],
1e-3);
}
LOG(INFO) << "batch_size: " << FLAGS_batch_size;
LOG(INFO) << "zero average time: "
<< total_time / (FLAGS_repeat * FLAGS_batch_size);
LOG(INFO) << "analysis average time: "
<< analysis_total_time / (FLAGS_repeat * FLAGS_batch_size);
LOG(INFO) << "native average time: "
<< native_total_time / (FLAGS_repeat * FLAGS_batch_size);
}
TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
AnalysisConfig config;
SetConfig(&config);
config.use_feed_fetch_ops = false;
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
double total_time_of_threads{0};
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
}
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
// auto predictor = base_predictor->Clone();
auto &predictor = predictors[tid];
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
NEW_TENSOR(week);
NEW_TENSOR(minute);
NEW_TENSOR(hidden_init);
// Prepare data for AnalysisPredictor
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
Timer timer;
double total_time{0};
for (int i = 0; i < FLAGS_repeat; i++) {
PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
cell_init_tensor.get(), data_tensor.get(),
hidden_init_tensor.get(), week_tensor.get(),
minute_tensor.get(), &data, FLAGS_batch_size);
timer.tic();
predictor->ZeroCopyRun();
total_time += timer.toc();
}
total_time_of_threads += total_time;
LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
});
}
for (auto &t : threads) {
t.join();
}
LOG(INFO) << "average time: "
<< total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
} }
} // namespace inference } // namespace inference
......
...@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) { ...@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
GetFuseStatis(predictor.get(), &num_ops);
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using contrib::AnalysisConfig;
struct Record { struct Record {
std::vector<float> data; std::vector<float> data;
...@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) { ...@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
GetFuseStatis(predictor.get(), &num_ops);
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
......
...@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor( ...@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config, std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
int *num_ops) { int *num_ops) {
auto predictor = CreateTestPredictor(config); auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument() auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>( .Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr); framework::ir::kFuseStatisAttr);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
using paddle::contrib::MixedRTConfig;
DEFINE_string(dirname, "", "Directory of the inference model.");
NativeConfig GetConfigNative() {
NativeConfig config;
config.model_dir = FLAGS_dirname;
// LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.45;
config.use_gpu = true;
config.device = 0;
return config;
}
MixedRTConfig GetConfigTRT() {
MixedRTConfig config;
config.model_dir = FLAGS_dirname;
config.use_gpu = true;
config.fraction_of_gpu_memory = 0.2;
config.device = 0;
config.max_batch_size = 3;
return config;
}
void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
NativeConfig config0 = GetConfigNative();
config0.model_dir = model_dirname;
MixedRTConfig config1 = GetConfigTRT();
config1.model_dir = model_dirname;
config1.max_batch_size = batch_size;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
// Prepare inputs
int height = 224;
int width = 224;
float *data = new float[batch_size * 3 * height * width];
memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
data[0] = 1.0f;
// Prepare inputs
PaddleTensor tensor;
tensor.name = "input_0";
tensor.shape = std::vector<int>({batch_size, 3, height, width});
tensor.data = PaddleBuf(static_cast<void *>(data),
sizeof(float) * (batch_size * 3 * height * width));
tensor.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
// Prepare outputs
std::vector<PaddleTensor> outputs0;
std::vector<PaddleTensor> outputs1;
CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
// Get output.
ASSERT_EQ(outputs0.size(), 1UL);
ASSERT_EQ(outputs1.size(), 1UL);
const size_t num_elements = outputs0.front().data.length() / sizeof(float);
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
EXPECT_EQ(num_elements, num_elements1);
auto *data0 = static_cast<float *>(outputs0.front().data.data());
auto *data1 = static_cast<float *>(outputs1.front().data.data());
ASSERT_GT(num_elements, 0UL);
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
EXPECT_NEAR(data0[i], data1[i], 1e-3);
}
}
TEST(trt_models_test, main) {
std::vector<std::string> infer_models = {"mobilenet", "resnet50",
"resnext50"};
for (auto &model_dir : infer_models) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
}
}
} // namespace paddle
...@@ -36,6 +36,8 @@ namespace memory { ...@@ -36,6 +36,8 @@ namespace memory {
using BuddyAllocator = detail::BuddyAllocator; using BuddyAllocator = detail::BuddyAllocator;
BuddyAllocator* GetCPUBuddyAllocator() { BuddyAllocator* GetCPUBuddyAllocator() {
// We tried thread_local for inference::RNN1 model, but that not works much
// for multi-thread test.
static std::once_flag init_flag; static std::once_flag init_flag;
static detail::BuddyAllocator* a = nullptr; static detail::BuddyAllocator* a = nullptr;
...@@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() { ...@@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() {
return a; return a;
} }
// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation,
// seems they are almost the same overhead.
struct NaiveAllocator {
void* Alloc(size_t size) { return malloc(size); }
void Free(void* p) {
PADDLE_ENFORCE(p);
free(p);
}
static NaiveAllocator* Instance() {
static NaiveAllocator x;
return &x;
}
private:
std::mutex lock_;
};
template <> template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
......
...@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel { ...@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
"Out and Label should have same height."); "Out and Label should have same height.");
int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1; int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
int slide_steps = ctx->Attrs().Get<int>("slide_steps");
PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1");
PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number");
ctx->SetOutputDim("AUC", {1}); ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("BatchAUC", {1});
ctx->SetOutputDim("StatPosOut", {num_pred_buckets}); slide_steps = slide_steps == 0 ? 1 : slide_steps;
ctx->SetOutputDim("StatNegOut", {num_pred_buckets}); ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
} }
protected: protected:
...@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", AddInput("Label",
"A 2D int tensor indicating the label of the training data. " "A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]"); "shape: [batch_size, 1]");
// TODO(typhoonzero): support weight input // TODO(typhoonzero): support weight input
AddInput("StatPos", "Statistic value when label = 1"); AddInput("StatPos", "Statistic value when label = 1");
AddInput("StatNeg", "Statistic value when label = 0"); AddInput("StatNeg", "Statistic value when label = 0");
...@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("AUC", AddOutput("AUC",
"A scalar representing the " "A scalar representing the "
"current area-under-the-curve."); "current area-under-the-curve.");
AddOutput("BatchAUC", "The AUC for current batch");
AddOutput("StatPosOut", "Statistic value when label = 1"); AddOutput("StatPosOut", "Statistic value when label = 1");
AddOutput("StatNegOut", "Statistic value when label = 0"); AddOutput("StatNegOut", "Statistic value when label = 0");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.") AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC"); .SetDefault("ROC");
AddAttr<int>("num_thresholds", AddAttr<int>(
"The number of thresholds to use when discretizing the" "num_thresholds",
" roc curve.") "The number of thresholds to use when discretizing the roc curve.")
.SetDefault((2 << 12) - 1); .SetDefault((2 << 12) - 1);
AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
.SetDefault(1);
AddComment(R"DOC( AddComment(R"DOC(
Area Under The Curve (AUC) Operator. Area Under The Curve (AUC) Operator.
......
...@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {
std::string curve = ctx.Attr<std::string>("curve"); std::string curve = ctx.Attr<std::string>("curve");
int num_thresholds = ctx.Attr<int>("num_thresholds"); int num_thresholds = ctx.Attr<int>("num_thresholds");
// buckets contain numbers from 0 to num_thresholds
int num_pred_buckets = num_thresholds + 1; int num_pred_buckets = num_thresholds + 1;
int slide_steps = ctx.Attr<int>("slide_steps");
// Only use output var for now, make sure it's persistable and // Only use output var for now, make sure it's persistable and
// not cleaned up for each batch. // not cleaned up for each batch.
...@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> {
auto *stat_pos = ctx.Output<Tensor>("StatPosOut"); auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
auto *stat_neg = ctx.Output<Tensor>("StatNegOut"); auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace()); auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace()); auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
auc);
auto *batch_auc = ctx.Output<Tensor>("BatchAUC"); std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0); std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(), auto stat_pos_calc = stat_pos_data.data();
num_thresholds, batch_auc); auto stat_neg_calc = stat_neg_data.data();
statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
} }
private: private:
...@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> {
return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
} }
inline static void calcAuc(const framework::ExecutionContext &ctx, inline static void statAuc(const framework::Tensor *label,
const framework::Tensor *label,
const framework::Tensor *predict, const framework::Tensor *predict,
int64_t *stat_pos, int64_t *stat_neg, const int num_pred_buckets,
int num_thresholds, const int num_thresholds, const int slide_steps,
framework::Tensor *auc_tensor) { int64_t *origin_stat_pos, int64_t *origin_stat_neg,
int64_t **stat_pos, int64_t **stat_neg) {
size_t batch_size = predict->dims()[0]; size_t batch_size = predict->dims()[0];
size_t inference_width = predict->dims()[1]; size_t inference_width = predict->dims()[1];
const T *inference_data = predict->data<T>(); const T *inference_data = predict->data<T>();
const auto *label_data = label->data<int64_t>(); const auto *label_data = label->data<int64_t>();
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
for (size_t i = 0; i < batch_size; i++) { for (size_t i = 0; i < batch_size; i++) {
uint32_t binIdx = static_cast<uint32_t>( uint32_t binIdx = static_cast<uint32_t>(
inference_data[i * inference_width + 1] * num_thresholds); inference_data[i * inference_width + 1] * num_thresholds);
if (label_data[i]) { if (label_data[i]) {
stat_pos[binIdx] += 1.0; (*stat_pos)[binIdx] += 1.0;
} else { } else {
stat_neg[binIdx] += 1.0; (*stat_neg)[binIdx] += 1.0;
} }
} }
int bucket_length = num_pred_buckets * sizeof(int64_t);
// will stat auc unlimited.
if (slide_steps == 0) {
for (int slide = 0; slide < num_pred_buckets; ++slide) {
origin_stat_pos[slide] += (*stat_pos)[slide];
origin_stat_neg[slide] += (*stat_neg)[slide];
}
*stat_pos = origin_stat_pos;
*stat_neg = origin_stat_neg;
} else {
for (int slide = 1; slide < slide_steps; ++slide) {
int dst_idx = (slide - 1) * num_pred_buckets;
int src_inx = slide * num_pred_buckets;
std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx,
bucket_length);
std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx,
bucket_length);
}
std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
*stat_pos, bucket_length);
std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
*stat_neg, bucket_length);
std::memset(*stat_pos, 0, bucket_length);
std::memset(*stat_neg, 0, bucket_length);
for (int slide = 0; slide < num_pred_buckets; ++slide) {
int stat_pos_steps = 0;
int stat_neg_steps = 0;
for (int step = 0; step < slide_steps; ++step) {
stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
}
(*stat_pos)[slide] += stat_pos_steps;
(*stat_neg)[slide] += stat_neg_steps;
}
}
}
inline static void calcAuc(const framework::ExecutionContext &ctx,
int64_t *stat_pos, int64_t *stat_neg,
int num_thresholds,
framework::Tensor *auc_tensor) {
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
*auc = 0.0f; *auc = 0.0f;
double totPos = 0.0; double totPos = 0.0;
...@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> {
totPos += stat_pos[idx]; totPos += stat_pos[idx];
totNeg += stat_neg[idx]; totNeg += stat_neg[idx];
*auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
--idx; --idx;
} }
......
...@@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc ...@@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu) polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
#Export local libraries to parent #Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { ...@@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()), framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
platform::CPUPlace()); ctx.device_context());
} }
}; };
...@@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, ...@@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
const T *im_info_data = im_info.data<T>(); const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace()); T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
T im_scale = im_info_data[2]; T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0], 1}); keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f); min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>(ctx.GetPlace()); int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
...@@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("post_nms_topN", "post_nms_topN"); AddAttr<int>("post_nms_topN", "post_nms_topN");
AddAttr<float>("nms_thresh", "nms_thres"); AddAttr<float>("nms_thresh", "nms_thres");
AddAttr<float>("min_size", "min size"); AddAttr<float>("min_size", "min size");
AddAttr<float>("eta", "eta"); AddAttr<float>("eta", "The parameter for adaptive NMS.");
AddComment(R"DOC( AddComment(R"DOC(
Generate Proposals OP Generate Proposals OP
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
namespace {
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const kThreadsPerBlock = sizeof(uint64_t) * 8;
template <typename T>
__global__ void RangeInitKernel(const T start, const T delta, const int size,
T *out) {
CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
}
template <typename T>
void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
Tensor *value_out, Tensor *index_out) {
int num = value.numel();
Tensor index_in_t;
int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream();
RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
const T *keys_in = value.data<T>();
T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
// Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
memory::Free(place, d_temp_storage);
}
template <typename T>
__device__ __forceinline__ T Min(T x, T y) {
return x < y ? x : y;
}
template <typename T>
__device__ __forceinline__ T Max(T x, T y) {
return x > y ? x : y;
}
template <typename T>
__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
const T *var, const int *index,
const T *im_info, const int num,
T *proposals) {
T kBBoxClipDefault = log(1000.0 / 16.0);
CUDA_1D_KERNEL_LOOP(i, num) {
int k = index[i] * 4;
T axmin = anchor[k];
T aymin = anchor[k + 1];
T axmax = anchor[k + 2];
T aymax = anchor[k + 3];
T w = axmax - axmin + 1.0;
T h = aymax - aymin + 1.0;
T cx = axmin + 0.5 * w;
T cy = aymin + 0.5 * h;
T dxmin = deltas[k];
T dymin = deltas[k + 1];
T dxmax = deltas[k + 2];
T dymax = deltas[k + 3];
T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
if (var) {
d_cx = cx + dxmin * w * var[k];
d_cy = cy + dymin * h * var[k + 1];
d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
} else {
d_cx = cx + dxmin * w;
d_cy = cy + dymin * h;
d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
}
T oxmin = d_cx - d_w * 0.5;
T oymin = d_cy - d_h * 0.5;
T oxmax = d_cx + d_w * 0.5 - 1.;
T oymax = d_cy + d_h * 0.5 - 1.;
proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
}
}
template <typename T, int BlockSize>
__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
const T min_size, const int num, int *keep_num,
int *keep) {
T im_h = im_info[0];
T im_w = im_info[1];
T im_scale = im_info[2];
int cnt = 0;
__shared__ int keep_index[BlockSize];
CUDA_1D_KERNEL_LOOP(i, num) {
keep_index[threadIdx.x] = -1;
__syncthreads();
int k = i * 4;
T xmin = bboxes[k];
T ymin = bboxes[k + 1];
T xmax = bboxes[k + 2];
T ymax = bboxes[k + 3];
T w = xmax - xmin + 1.0;
T h = ymax - ymin + 1.0;
T cx = xmin + w / 2.;
T cy = ymin + h / 2.;
T w_s = (xmax - xmin) / im_scale + 1.;
T h_s = (ymax - ymin) / im_scale + 1.;
if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) {
keep_index[threadIdx.x] = i;
}
__syncthreads();
if (threadIdx.x == 0) {
int size = (num - i) < BlockSize ? num - i : BlockSize;
for (int j = 0; j < size; ++j) {
if (keep_index[j] > -1) {
keep[cnt++] = keep_index[j];
}
}
}
__syncthreads();
}
if (threadIdx.x == 0) {
keep_num[0] = cnt;
}
}
__device__ inline float IoU(const float *a, const float *b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float inter_s = width * height;
float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return inter_s / (s_a + s_b - inter_s);
}
__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
const float *dev_boxes, uint64_t *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
const int row_size =
min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock);
const int col_size =
min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock);
__shared__ float block_boxes[kThreadsPerBlock * 4];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 4 + 0] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0];
block_boxes[threadIdx.x * 4 + 1] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1];
block_boxes[threadIdx.x * 4 + 2] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2];
block_boxes[threadIdx.x * 4 + 3] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
uint64_t t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
template <typename T>
void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const Tensor &sorted_indices, const T nms_threshold,
Tensor *keep_out) {
int boxes_num = proposals.dims()[0];
PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
DIVUP(boxes_num, kThreadsPerBlock));
dim3 threads(kThreadsPerBlock);
const T *boxes = proposals.data<T>();
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
uint64_t *d_mask =
reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
uint64_t *h_mask = reinterpret_cast<uint64_t *>(
memory::Alloc(platform::CPUPlace(), size_bytes));
memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<int> keep_vec;
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / kThreadsPerBlock;
int inblock = i % kThreadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
++num_to_keep;
keep_vec.push_back(i);
uint64_t *p = &h_mask[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
sizeof(int) * num_to_keep, 0);
memory::Free(place, d_mask);
memory::Free(platform::CPUPlace(), h_mask);
}
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
const platform::CUDADeviceContext &ctx, const Tensor &im_info,
const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas, // [M, 4]
const Tensor &scores, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
// 1. pre nms
Tensor scores_sort, index_sort;
SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
int num = scores.numel();
int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
: pre_nms_top_n;
scores_sort.Resize({pre_nms_num, 1});
index_sort.Resize({pre_nms_num, 1});
// 2. box decode and clipping
Tensor proposals;
proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream();
BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
proposals.data<T>());
// 3. filter
Tensor keep_index, keep_num_t;
keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
min_size = std::max(min_size, 1.0f);
FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
keep_num_t.data<int>(), keep_index.data<int>());
int keep_num;
const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
keep_num_t.data<int>(), sizeof(int), 0);
keep_index.Resize({keep_num});
Tensor scores_filter, proposals_filter;
proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
if (nms_thresh <= 0) {
return std::make_pair(proposals_filter, scores_filter);
}
// 4. nms
Tensor keep_nms;
NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
Tensor scores_nms, proposals_nms;
proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
return std::make_pair(proposals_nms, scores_nms);
}
} // namespace
template <typename DeviceContext, typename T>
class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *scores = context.Input<Tensor>("Scores");
auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
auto *im_info = context.Input<Tensor>("ImInfo");
auto *anchors = context.Input<Tensor>("Anchors");
auto *variances = context.Input<Tensor>("Variances");
auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
int post_nms_top_n = context.Attr<int>("post_nms_topN");
float nms_thresh = context.Attr<float>("nms_thresh");
float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta");
PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS.");
auto &dev_ctx = context.template device_context<DeviceContext>();
auto scores_dim = scores->dims();
int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3];
auto bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3];
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
dev_ctx.GetPlace());
scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
dev_ctx.GetPlace());
math::Transpose<DeviceContext, T, 4> trans;
std::vector<int> axis = {0, 2, 3, 1};
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis);
Tensor *anchor = const_cast<framework::Tensor *>(anchors);
anchor->Resize({anchors->numel() / 4, 4});
Tensor *var = const_cast<framework::Tensor *>(variances);
var->Resize({var->numel() / 4, 4});
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
T *rpn_rois_data = rpn_rois->data<T>();
T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
int64_t num_proposals = 0;
std::vector<size_t> offset(1, 0);
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> box_score_pair =
ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta);
Tensor proposals = box_score_pair.first;
Tensor scores = box_score_pair.second;
memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
scores.data<T>(), sizeof(T) * scores.numel(), 0);
num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals);
}
framework::LoD lod;
lod.emplace_back(offset);
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4});
rpn_roi_probs->Resize({num_proposals, 1});
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(generate_proposals,
ops::CUDAGenerateProposalsKernel<
paddle::platform::CUDADeviceContext, float>);
...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type")); auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
auto label_lod = in_label->lod(); auto& label_lod = in_label->lod();
auto detect_lod = in_detect->lod(); auto& detect_lod = in_detect->lod();
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label); auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect); auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto label_lod = input_label.lod(); auto& label_lod = input_label.lod();
auto detect_lod = input_detect.lod(); auto& detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1; int batch_size = label_lod[0].size() - 1;
auto label_index = label_lod[0]; auto& label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes; std::map<int, std::vector<Box>> boxes;
...@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod); output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod); output_false_pos->set_lod(false_pos_lod);
return;
} }
void GetInputPos(const framework::Tensor& input_pos_count, void GetInputPos(const framework::Tensor& input_pos_count,
...@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) { std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>(); const T* pos_data = pos_tensor.data<T>();
auto pos_data_lod = pos_tensor.lod()[0]; auto& pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2]; T score = pos_data[j * 2];
...@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const { std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size(); int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
auto image_gt_boxes = gt_boxes[n]; auto& image_gt_boxes = gt_boxes[n];
for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { for (auto& image_gt_box : image_gt_boxes) {
size_t count = 0; size_t count = 0;
auto labeled_bboxes = it->second; auto& labeled_bboxes = image_gt_box.second;
if (evaluate_difficult) { if (evaluate_difficult) {
count = labeled_bboxes.size(); count = labeled_bboxes.size();
} else { } else {
for (size_t i = 0; i < labeled_bboxes.size(); ++i) for (auto& box : labeled_bboxes) {
if (!(labeled_bboxes[i].is_difficult)) ++count; if (!box.is_difficult) {
++count;
}
}
} }
if (count == 0) { if (count == 0) {
continue; continue;
} }
int label = it->first; int label = image_gt_box.first;
if (label_pos_count->find(label) == label_pos_count->end()) { if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count; (*label_pos_count)[label] = count;
} else { } else {
......
...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { ...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>(); auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto in_rows = in.rows(); auto &in_rows = in.rows();
auto out_dim = framework::make_ddim( auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1}); std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place()); auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
......
...@@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1, PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1."); "The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE_EQ( if (ctx->Attrs().Get<bool>("use_peepholes")) {
b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size, PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
"The second dimension of Input(Bias) should be " "The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection or" "7 * %d if enable peepholes connection",
"4 * %d if disable peepholes", frame_size);
frame_size, frame_size); ctx->SetOutputDim("CheckedCell", {2, frame_size});
} else {
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be "
"4 * %d if disable peepholes",
frame_size);
}
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
...@@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() { ...@@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() {
AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.")
.AsIntermediate();
AddAttr<bool>("use_peepholes", AddAttr<bool>("use_peepholes",
"(bool, defalut: True) " "(bool, defalut: True) "
"whether to enable diagonal/peephole connections.") "whether to enable diagonal/peephole connections.")
...@@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
const int D3 = D * 3; \ const int D3 = D * 3; \
const int D4 = wh_dims[1]; const int D4 = wh_dims[1];
#define INIT_BASE_INPUT_DATAS \ #define INIT_BASE_INPUT_DATAS \
const T* x_data = x->data<T>(); \ const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \ const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \ const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \ /* diagonal weight*/ \
const T* wc_data = bias->data<T>() + D4; \ const T* wc_data = bias->data<T>() + D4; \
/* for peephole only*/ \ /* for peephole only*/ \
Tensor checked_cell; \ T* checked_cell_data = nullptr; \
T* checked_cell_data = nullptr; \ auto place = ctx.GetPlace(); \
auto place = ctx.GetPlace(); \ if (use_peepholes) { \
if (use_peepholes) { \ /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \ checked_cell_data = checked_cell->mutable_data<T>(place); \
} }
/// Compute LSTM /// Compute LSTM
......
...@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> { ...@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace()); auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here. // TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(), memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place, gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value(); auto *d_table_value = d_table->mutable_value();
......
...@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy( memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::CUDAPlace>(out_place), out_data, boost::get<platform::CUDAPlace>(in1_place), in1_data,
boost::get<platform::CUDAPlace>(in1_place), in1_data, in1_value.numel() * sizeof(T), context.stream());
in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
auto* in2_data = in2_value.data<T>(); auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place), memory::Copy(boost::get<platform::CUDAPlace>(out_place),
...@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();
......
...@@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
SamplingId Operator. SamplingId Operator.
A layer for sampling id from multinomial distribution from the A layer for sampling id from multinomial distribution from the
input. Sampling one id for one sample.)DOC"); input. Sampling one id for one sample.)DOC");
AddAttr<float>("min", "Minimum value of random. [default 0.0].") AddAttr<float>("min", "Minimum value of random. (float, default 0.0).")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<float>("max", "Maximun value of random. [default 1.0].") AddAttr<float>("max", "Maximun value of random. (float, default 1.0).")
.SetDefault(1.0f); .SetDefault(1.0f);
AddAttr<int>("seed", AddAttr<int>(
"Random seed used for the random number engine. " "seed",
"0 means use a seed generated by the system." "Random seed used for the random number engine. "
"Note that if seed is not 0, this operator will always " "0 means use a seed generated by the system."
"generate the same random numbers every time. [default 0].") "Note that if seed is not 0, this operator will always "
"generate the same random numbers every time. (int, default 0).")
.SetDefault(0); .SetDefault(0);
} }
}; };
......
...@@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference { ...@@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = op_desc.Output("Out").front();
auto *out_var = block->FindVarRecursive(out_var_name); auto *out_var = block->FindVarRecursive(out_var_name);
out_var->SetType(in_var.GetType()); if (in_var_name != out_var_name) {
out_var->SetDataType(in_var.GetDataType()); out_var->SetType(in_var.GetType());
out_var->SetDataType(in_var.GetDataType());
}
} }
}; };
......
...@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> { ...@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
} }
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_LT(0, offset_data[i], PADDLE_ENFORCE_LE(0, offset_data[i],
"The offset[%d] must greater than zero.", i); "The offset[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(0, length_data[i], PADDLE_ENFORCE_LT(0, length_data[i],
"The length[%d] must greater than zero.", i); "The length[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
lod[0][i + 1], "The target tensor's length overflow."); lod[0][i + 1], "The target tensor's length overflow.");
} }
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #include <algorithm>
#include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/operators/sgd_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, ...@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
} }
} }
template <typename T, int block_size> template <typename T>
__global__ void SparseSGDFunctorKernel(const T* selected_rows, __global__ void SparseSGDFunctorKernel(const T* selected_rows,
const int64_t* rows, const int64_t* rows,
const T* learning_rate, T* tensor_out, const T* learning_rate, T* tensor_out,
int64_t row_numel) { int64_t row_numel, int64_t limit) {
const int ty = blockIdx.y; for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
int tid = threadIdx.x; const T* selected_rows_ptr = selected_rows + i * row_numel;
T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
selected_rows += ty * row_numel; for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
tensor_out += rows[ty] * row_numel; // Since index in rows of SelectedRows can be duplicate, we have to use
// Atomic Operation to avoid concurrent write error.
for (int index = tid; index < row_numel; index += block_size) { paddle::platform::CudaAtomicAdd(
// Since index in rows of SelectedRows can be duplicate, we have to use tensor_out_ptr + index,
// Atomic Operation to avoid concurrent write error. -1.0 * learning_rate[0] * selected_rows_ptr[index]);
paddle::platform::CudaAtomicAdd( }
tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
} }
} }
} // namespace } // namespace
...@@ -89,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -89,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value(); auto& in_value = grad->value();
framework::Vector<int64_t> in_rows(grad->rows()); auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size(); int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
...@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
auto* in_data = in_value.data<T>(); auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>(); auto* out_data = param_out->data<T>();
const int block_size = 256; const int kThreadsPerBlock = 256;
dim3 threads(block_size, 1); int thread_x = kThreadsPerBlock;
dim3 grid(1, in_rows.size()); int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
SparseSGDFunctorKernel< int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
ctx.cuda_device_context().stream()>>>(
in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(), in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
out_data, in_row_numel); out_data, in_row_numel, in_rows.size());
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_THROW("Unsupported Variable Type of Grad");
......
...@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
auto in_vars = context.MultiInputVar("X"); auto in_vars = context.MultiInputVar("X");
int N = in_vars.size(); size_t in_num = in_vars.size();
auto out_var = context.OutputVar("Out"); auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0]; bool in_place = out_var == in_vars[0];
...@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> {
auto &place = auto &place =
*context.template device_context<DeviceContext>().eigen_device(); *context.template device_context<DeviceContext>().eigen_device();
// If in_place, just skip the first tensor // If in_place, just skip the first tensor
for (int i = in_place ? 1 : 0; i < N; i++) { for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) { if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>(); auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) { if (in_t.numel() == 0) {
...@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> {
// Runtime InferShape // Runtime InferShape
size_t first_dim = 0; size_t first_dim = 0;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size(); first_dim += sel_row.rows().size();
} }
std::vector<int64_t> in_dim; std::vector<int64_t> in_dim;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() > 0) { if (sel_row.rows().size() > 0) {
in_dim = framework::vectorize(sel_row.value().dims()); in_dim = framework::vectorize(sel_row.value().dims());
...@@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel<T> {
} }
if (in_dim.empty()) { if (in_dim.empty()) {
VLOG(3) << "WARNING: all the inputs are empty"; VLOG(3) << "WARNING: all the inputs are empty";
in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); in_dim =
framework::vectorize(get_selected_row(in_num - 1).value().dims());
} else { } else {
in_dim[0] = static_cast<int64_t>(first_dim); in_dim[0] = static_cast<int64_t>(first_dim);
} }
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {
...@@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel<T> {
math::SelectedRowsAddTo<DeviceContext, T> functor; math::SelectedRowsAddTo<DeviceContext, T> functor;
int64_t offset = 0; int64_t offset = 0;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() == 0) { if (sel_row.rows().size() == 0) {
continue; continue;
......
...@@ -22,8 +22,6 @@ ...@@ -22,8 +22,6 @@
namespace paddle { namespace paddle {
DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");
namespace operators { namespace operators {
...@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Ys", "A list of outputs").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph."); AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine."); AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
AddAttr<int>("max_batch_size", "the maximum batch size.");
AddAttr<int>("workspace_size", "the workspace size.");
AddComment("TensorRT engine operator."); AddComment("TensorRT engine operator.");
} }
}; };
......
...@@ -28,8 +28,6 @@ ...@@ -28,8 +28,6 @@
namespace paddle { namespace paddle {
DECLARE_int32(tensorrt_engine_batch_size); DECLARE_int32(tensorrt_engine_batch_size);
DECLARE_int32(tensorrt_max_batch_size);
DECLARE_int32(tensorrt_workspace_size);
namespace operators { namespace operators {
...@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto engine_name = context.Attr<std::string>("engine_uniq_key"); auto engine_name = context.Attr<std::string>("engine_uniq_key");
int max_batch_size = context.Attr<int>("max_batch_size");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) { if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context); Prepare(context);
} }
auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name); auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
auto input_names = context.op().Inputs("Xs"); auto input_names = context.op().Inputs("Xs");
PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
FLAGS_tensorrt_max_batch_size);
std::vector<std::string> output_maps = std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping"); context.Attr<std::vector<std::string>>("output_name_mapping");
...@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Get the ProgramDesc and pass to convert. // Get the ProgramDesc and pass to convert.
framework::proto::BlockDesc block_desc; framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(context.Attr<std::string>("subgraph")); block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
int max_batch = FLAGS_tensorrt_max_batch_size; int max_batch_size = context.Attr<int>("max_batch_size");
auto max_workspace = FLAGS_tensorrt_workspace_size; int workspace_size = context.Attr<int>("workspace_size");
auto params = context.Attr<std::vector<std::string>>("parameters"); auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters; std::unordered_set<std::string> parameters;
for (const auto& param : params) { for (const auto& param : params) {
...@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// TODO(Superjomn) replace this with a different stream // TODO(Superjomn) replace this with a different stream
auto* engine = Singleton<TRT_EngineManager>::Global().Create( auto* engine = Singleton<TRT_EngineManager>::Global().Create(
max_batch, max_workspace, nullptr /*engine hold its own stream*/, max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
context.Attr<std::string>("engine_uniq_key"), context.Attr<std::string>("engine_uniq_key"),
boost::get<platform::CUDAPlace>(context.GetPlace()).device); boost::get<platform::CUDAPlace>(context.GetPlace()).device);
......
...@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, ...@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
using inference::analysis::SetAttr; using inference::analysis::SetAttr;
TEST(TensorRTEngineOp, manual) { TEST(TensorRTEngineOp, manual) {
FLAGS_tensorrt_engine_batch_size = 2;
FLAGS_tensorrt_max_batch_size = 2;
framework::ProgramDesc program; framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks(); auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0); block_->set_idx(0);
...@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) { ...@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); block_->SerializeAsString());
SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters", SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
std::vector<std::string>({})); std::vector<std::string>({}));
...@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) { ...@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) {
} }
void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
FLAGS_tensorrt_engine_batch_size = batch_size;
FLAGS_tensorrt_max_batch_size = batch_size;
framework::ProgramDesc program; framework::ProgramDesc program;
framework::Scope scope; framework::Scope scope;
platform::CUDAPlace place; platform::CUDAPlace place;
...@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); block_->SerializeAsString());
SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size); SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10); SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
SetAttr<std::vector<std::string>>( SetAttr<std::vector<std::string>>(
engine_op_desc.Proto(), "parameters", engine_op_desc.Proto(), "parameters",
std::vector<std::string>({"y0", "y1", "y2", "y3"})); std::vector<std::string>({"y0", "y1", "y2", "y3"}));
......
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method) set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
if(NOT WIN32) if(NOT WIN32)
list(APPEND PYBIND_DEPS parallel_executor profiler) list(APPEND PYBIND_DEPS parallel_executor profiler)
......
...@@ -285,12 +285,12 @@ void BindOpDesc(pybind11::module *m) { ...@@ -285,12 +285,12 @@ void BindOpDesc(pybind11::module *m) {
.def("set_output", &pd::OpDesc::SetOutput) .def("set_output", &pd::OpDesc::SetOutput)
.def("input_arg_names", &pd::OpDesc::InputArgumentNames) .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
.def("output_arg_names", &pd::OpDesc::OutputArgumentNames) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
.def("rename_input", &pd::OpDesc::RenameInput) .def("_rename_input", &pd::OpDesc::RenameInput)
.def("rename_output", &pd::OpDesc::RenameOutput) .def("_rename_output", &pd::OpDesc::RenameOutput)
.def("has_attr", &pd::OpDesc::HasAttr) .def("has_attr", &pd::OpDesc::HasAttr)
.def("attr_type", &pd::OpDesc::GetAttrType) .def("attr_type", &pd::OpDesc::GetAttrType)
.def("attr_names", &pd::OpDesc::AttrNames) .def("attr_names", &pd::OpDesc::AttrNames)
.def("set_attr", &pd::OpDesc::SetAttr) .def("_set_attr", &pd::OpDesc::SetAttr)
.def("attr", &pd::OpDesc::GetAttr) .def("attr", &pd::OpDesc::GetAttr)
.def("set_block_attr", &pd::OpDesc::SetBlockAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
.def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
...@@ -300,8 +300,8 @@ void BindOpDesc(pybind11::module *m) { ...@@ -300,8 +300,8 @@ void BindOpDesc(pybind11::module *m) {
std::string ser(seriralized); std::string ser(seriralized);
self.SetAttr(name, ser); self.SetAttr(name, ser);
}) })
.def("block_attr_id", &pd::OpDesc::GetBlockAttrId) .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId)
.def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
.def("check_attrs", &pd::OpDesc::CheckAttrs) .def("check_attrs", &pd::OpDesc::CheckAttrs)
.def("infer_shape", &pd::OpDesc::InferShape) .def("infer_shape", &pd::OpDesc::InferShape)
.def("infer_var_type", &pd::OpDesc::InferVarType) .def("infer_var_type", &pd::OpDesc::InferVarType)
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
...@@ -595,6 +596,29 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -595,6 +596,29 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init())
.def("set_str", [](ir::Pass &self, const std::string &name,
const std::string &attr) {
self.Set<std::string>(name, new std::string(attr));
});
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
m, "PassBuilder");
pb.def(py::init())
.def("append_pass",
[](ir::PassBuilder &self,
const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
return self.AppendPass(pass_type);
})
.def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
.def("insert_pass",
[](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
return self.InsertPass(idx, pass_type);
})
.def("remove_pass",
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
// -- python binds for parallel executor. // -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor"); py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy"); py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
...@@ -677,7 +701,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -677,7 +701,11 @@ All parameter, weight, gradient are variables in Paddle.
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
self.fuse_elewise_add_act_ops_ = b; self.fuse_elewise_add_act_ops_ = b;
}); })
.def("_create_passes_from_strategy",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy();
});
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const std::unordered_set<std::string> &,
......
...@@ -56,13 +56,13 @@ struct Style { ...@@ -56,13 +56,13 @@ struct Style {
}; };
template <typename... Args> template <typename... Args>
static void PrettyLogEndl(const std::string& style, const char* fmt, static void PrettyLogEndl(const std::string &style, const char *fmt,
const Args&... args) { const Args &... args) {
std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
} }
template <typename... Args> template <typename... Args>
static void PrettyLog(const std::string& style, const char* fmt, static void PrettyLog(const std::string &style, const char *fmt,
const Args&... args) { const Args &... args) {
std::cerr << style << Sprintf(fmt, args...) << reset(); std::cerr << style << Sprintf(fmt, args...) << reset();
} }
......
function(train_test TARGET_NAME)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(train_test_ARGS)
foreach(arg ${train_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(test_train_${TARGET_NAME}${arg}
SRCS test_train_${TARGET_NAME}.cc
DEPS paddle_fluid_origin
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
endforeach()
endfunction(train_test)
if(WITH_TESTING)
train_test(recognize_digits ARGS mlp conv)
endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include <fstream>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
DEFINE_string(dirname, "", "Directory of the train model.");
namespace paddle {
void Train() {
CHECK(!FLAGS_dirname.empty());
framework::InitDevices(false);
const auto cpu_place = platform::CPUPlace();
framework::Executor executor(cpu_place);
framework::Scope scope;
auto train_program = inference::Load(
&executor, &scope, FLAGS_dirname + "__model_combined__.main_program",
FLAGS_dirname + "__params_combined__");
std::string loss_name = "";
for (auto op_desc : train_program->Block(0).AllOps()) {
if (op_desc->Type() == "mean") {
loss_name = op_desc->Output("Out")[0];
break;
}
}
PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
// prepare data
auto x_var = scope.Var("img");
auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
x_tensor->Resize({64, 1, 28, 28});
auto x_data = x_tensor->mutable_data<float>(cpu_place);
for (int i = 0; i < 64 * 28 * 28; ++i) {
x_data[i] = 1.0;
}
auto y_var = scope.Var("label");
auto y_tensor = y_var->GetMutable<framework::LoDTensor>();
y_tensor->Resize({64, 1});
auto y_data = y_tensor->mutable_data<int64_t>(cpu_place);
for (int i = 0; i < 64 * 1; ++i) {
y_data[i] = static_cast<int64_t>(1);
}
auto loss_var = scope.Var(loss_name);
float first_loss = 0.0;
float last_loss = 0.0;
for (int i = 0; i < 100; ++i) {
executor.Run(*train_program.get(), &scope, 0, false, true);
if (i == 0) {
first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
} else if (i == 99) {
last_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
}
}
EXPECT_LT(last_loss, first_loss);
}
TEST(train, recognize_digits) { Train(); }
} // namespace paddle
...@@ -70,8 +70,8 @@ function cmake_gen() { ...@@ -70,8 +70,8 @@ function cmake_gen() {
PYTHON_FLAGS="" PYTHON_FLAGS=""
SYSTEM=`uname -s` SYSTEM=`uname -s`
if [ "$SYSTEM" == "Darwin" ]; then if [ "$SYSTEM" == "Darwin" ]; then
echo "Using python abi: $1"
if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
echo "using python abi: $1"
if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
...@@ -82,7 +82,18 @@ function cmake_gen() { ...@@ -82,7 +82,18 @@ function cmake_gen() {
else else
exit 1 exit 1
fi fi
# TODO: qiyang add python3 part here elif [ "$1" == "cp35-cp35m" ]; then
if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
else
exit 1
fi
fi fi
else else
if [ "$1" != "" ]; then if [ "$1" != "" ]; then
...@@ -381,7 +392,7 @@ function run_mac_test() { ...@@ -381,7 +392,7 @@ function run_mac_test() {
EOF EOF
# TODO: jiabin need to refine this part when these tests fixed on mac # TODO: jiabin need to refine this part when these tests fixed on mac
ctest --output-on-failure -j8 ctest --output-on-failure -j $1
# make install should also be test when unittest # make install should also be test when unittest
make install -j 8 make install -j 8
pip install /usr/local/opt/paddle/share/wheels/*.whl pip install /usr/local/opt/paddle/share/wheels/*.whl
...@@ -629,10 +640,10 @@ EOF ...@@ -629,10 +640,10 @@ EOF
function gen_capi_package() { function gen_capi_package() {
if [[ ${WITH_C_API} == "ON" ]]; then if [[ ${WITH_C_API} == "ON" ]]; then
install_prefix="${PADDLE_ROOT}/build/capi_output" capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output
rm -rf $install_prefix rm -rf $capi_install_prefix
make DESTDIR="$install_prefix" install make DESTDIR="$capi_install_prefix" install
cd $install_prefix/usr/local cd $capi_install_prefix/
ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
fi fi
} }
...@@ -729,7 +740,11 @@ function main() { ...@@ -729,7 +740,11 @@ function main() {
maccheck) maccheck)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
build_mac build_mac
run_mac_test run_mac_test ${PROC_RUN:-1}
;;
macbuild)
cmake_gen ${PYTHON_ABI:-""}
build_mac
;; ;;
cicheck_py35) cicheck_py35)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
......
...@@ -87,6 +87,7 @@ if (WITH_TESTING) ...@@ -87,6 +87,7 @@ if (WITH_TESTING)
endif() endif()
endif() endif()
add_subdirectory(paddle/fluid/tests) add_subdirectory(paddle/fluid/tests)
add_subdirectory(paddle/fluid/contrib/tests)
endif() endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels DESTINATION opt/paddle/share/wheels
......
...@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None): ...@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None):
retry_limit = 3 retry_limit = 3
while not (os.path.exists(filename) and md5file(filename) == md5sum): while not (os.path.exists(filename) and md5file(filename) == md5sum):
if os.path.exists(filename): if os.path.exists(filename):
print("file md5", md5file(filename), md5sum) sys.stderr.write("file %s md5 %s" % (md5file(filename), md5sum))
if retry < retry_limit: if retry < retry_limit:
retry += 1 retry += 1
else: else:
raise RuntimeError("Cannot download {0} within retry limit {1}". raise RuntimeError("Cannot download {0} within retry limit {1}".
format(url, retry_limit)) format(url, retry_limit))
print("Cache file %s not found, downloading %s" % (filename, url)) sys.stderr.write("Cache file %s not found, downloading %s" %
(filename, url))
r = requests.get(url, stream=True) r = requests.get(url, stream=True)
total_length = r.headers.get('content-length') total_length = r.headers.get('content-length')
...@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None): ...@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None):
dl += len(data) dl += len(data)
f.write(data) f.write(data)
done = int(50 * dl / total_length) done = int(50 * dl / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done, sys.stderr.write("\r[%s%s]" % ('=' * done,
' ' * (50 - done))) ' ' * (50 - done)))
sys.stdout.flush() sys.stdout.flush()
sys.stderr.write("\n")
sys.stdout.flush()
return filename return filename
......
...@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size): ...@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size):
] ]
for name in names: for name in names:
for line in f.extractfile(name): for line in f.extractfile(name):
line_split = line.strip().split(six.b('\t')) line = cpt.to_text(line)
line_split = line.strip().split('\t')
if len(line_split) != 2: if len(line_split) != 2:
continue continue
src_seq = line_split[0] # one source sequence src_seq = line_split[0] # one source sequence
......
...@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int) word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f: with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"): for line in f.extractfile("wmt16/train"):
line_split = line.strip().split(six.b("\t")) line = cpt.to_text(line)
line_split = line.strip().split("\t")
if len(line_split) != 2: continue if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1] sen = line_split[0] if lang == "en" else line_split[1]
for w in sen.split(): for w in sen.split():
...@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): ...@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
with tarfile.open(tar_file, mode="r") as f: with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name): for line in f.extractfile(file_name):
line_split = line.strip().split(six.b("\t")) line = cpt.to_text(line)
line_split = line.strip().split("\t")
if len(line_split) != 2: if len(line_split) != 2:
continue continue
src_words = line_split[src_col].split() src_words = line_split[src_col].split()
......
...@@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): ...@@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
op_desc = op_descs[i] op_desc = op_descs[i]
if isinstance(op_desc, tuple): if isinstance(op_desc, tuple):
op_desc = op_desc[0] op_desc = op_desc[0]
op_desc.rename_input(old_name, new_name) op_desc._rename_input(old_name, new_name)
op_desc.rename_output(old_name, new_name) op_desc._rename_output(old_name, new_name)
def _create_op_desc_(op_type, inputs, outputs, attrs): def _create_op_desc_(op_type, inputs, outputs, attrs):
...@@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): ...@@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
if isinstance(val, framework.Block): if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc) op_desc.set_block_attr(name, val.desc)
else: else:
op_desc.set_attr(name, val) op_desc._set_attr(name, val)
return op_desc return op_desc
...@@ -346,7 +346,7 @@ def _append_backward_ops_(block, ...@@ -346,7 +346,7 @@ def _append_backward_ops_(block,
grad_sub_block_list = [] grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first # If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"): if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr_id("sub_block")) sub_block = program.block(op._block_attr_id("sub_block"))
grad_sub_block = program._create_block() grad_sub_block = program._create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx) grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op) cb = _callback_lookup_(op)
...@@ -382,7 +382,7 @@ def _append_backward_ops_(block, ...@@ -382,7 +382,7 @@ def _append_backward_ops_(block,
for op_desc in grad_op_descs: for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op() new_op_desc = target_block.desc.append_op()
new_op_desc.copy_from(op_desc) new_op_desc.copy_from(op_desc)
new_op_desc.set_attr(op_role_attr_name, backward) new_op_desc._set_attr(op_role_attr_name, backward)
grad_to_var["__current_op_desc__"] = new_op_desc grad_to_var["__current_op_desc__"] = new_op_desc
if callbacks is not None: if callbacks is not None:
assert (isinstance(callbacks, list)) assert (isinstance(callbacks, list))
...@@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): ...@@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for op_idx in range(start_op_idx, block.desc.op_size()): for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx) op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"): if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr_id("sub_block")) sub_block = block.program.block(op_desc._block_attr_id("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set() new_vars = set()
# create new gradient variables # create new gradient variables
...@@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): ...@@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
op_desc = block.desc.op(op_idx) op_desc = block.desc.op(op_idx)
for name in op_desc.input_arg_names(): for name in op_desc.input_arg_names():
if name in var_map: if name in var_map:
op_desc.rename_input(name, var_map[name]) op_desc._rename_input(name, var_map[name])
for name in op_desc.output_arg_names(): for name in op_desc.output_arg_names():
if block.desc.find_var(name.encode("ascii")): if block.desc.find_var(name.encode("ascii")):
new_name = unique_name.generate(name) new_name = unique_name.generate(name)
op_desc.rename_output(name, new_name) op_desc._rename_output(name, new_name)
var_map[name] = new_name var_map[name] = new_name
for g, ng in six.iteritems(var_map): for g, ng in six.iteritems(var_map):
...@@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
if loss.op is None: if loss.op is None:
raise ValueError("loss.op is None. Should not happend") raise ValueError("loss.op is None. Should not happend")
loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(), loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
int(core.op_proto_and_checker_maker.OpRole.Forward) | int(core.op_proto_and_checker_maker.OpRole.Forward) |
int(core.op_proto_and_checker_maker.OpRole.Loss)) int(core.op_proto_and_checker_maker.OpRole.Loss))
if callbacks is not None: if callbacks is not None:
isinstance(callbacks, list) isinstance(callbacks, list)
...@@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
attr_val = [p.name, g.name] attr_val = [p.name, g.name]
if g.op.has_attr(op_role_var_attr_name): if g.op.has_attr(op_role_var_attr_name):
attr_val.extend(g.op.attr(op_role_var_attr_name)) attr_val.extend(g.op.attr(op_role_var_attr_name))
g.op.set_attr(op_role_var_attr_name, attr_val) g.op._set_attr(op_role_var_attr_name, attr_val)
return params_and_grads return params_and_grads
......
...@@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr): ...@@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr):
clip_op_desc.set_type("clip") clip_op_desc.set_type("clip")
clip_op_desc.set_input("X", [grad_name]) clip_op_desc.set_input("X", [grad_name])
clip_op_desc.set_output("Out", [grad_name]) clip_op_desc.set_output("Out", [grad_name])
clip_op_desc.set_attr("min", self.min) clip_op_desc._set_attr("min", self.min)
clip_op_desc.set_attr("max", self.max) clip_op_desc._set_attr("max", self.max)
def error_clip_callback(block, context): def error_clip_callback(block, context):
......
...@@ -18,5 +18,13 @@ from . import decoder ...@@ -18,5 +18,13 @@ from . import decoder
from .decoder import * from .decoder import *
from . import memory_usage_calc from . import memory_usage_calc
from .memory_usage_calc import * from .memory_usage_calc import *
from . import op_frequence
from .op_frequence import *
from . import quantize
from .quantize import *
__all__ = decoder.__all__ + memory_usage_calc.__all__ __all__ = []
__all__ += decoder.__all__
__all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__
__all__ += quantize.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from collections import OrderedDict
from ..framework import Program
__all__ = ['op_freq_statistic']
def op_freq_statistic(program):
"""
Statistics of Op frequency.
Args:
program(Program): The current Program.
Returns:
uni_op_freq(dict): the single op frequency.
adj_2_op_freq(dict): the two adjacent ops frequency.
Examples:
>>> import paddle.fluid as fluid
>>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic(
>>> fluid.default_main_program())
>>> for op_type, op_num in uni_op_freq:
>>> print("%s \t %d" % (op_type, op_num))
>>> for op_type, op_num in adj_2_op_freq:
>>> print("%s \t %d" % (op_type, op_num))
"""
if not isinstance(program, Program):
raise TypeError("The input type should be Porgram."
"But you passed in %s" % (type(program)))
uni_op_freq = OrderedDict()
adj_2_op_freq = OrderedDict()
op_in_ops = OrderedDict()
parameters = [p.name for p in program.blocks[0].all_parameters()]
# get uni_op_freq
for op in program.global_block().ops:
had_recorded = False
for var_name in op.output_arg_names:
if var_name in parameters:
continue
if not had_recorded and uni_op_freq.has_key(op.type):
uni_op_freq[op.type] += 1
had_recorded = True
elif not had_recorded:
uni_op_freq[op.type] = 1
had_recorded = True
# get adj_2_op_freq
var_gen_op = {}
for op in program.global_block().ops:
for var_name in op.input_arg_names:
if var_name in parameters:
continue
if var_gen_op.has_key(var_name):
assert len(var_gen_op[var_name]) > 0
if op_in_ops.has_key(op.type):
op_in_ops[op.type].append(var_gen_op[var_name][-1])
else:
op_in_ops[op.type] = [var_gen_op[var_name][-1]]
else:
print("Var's generate op is not found,%s, %s" %
(var_name, op.type))
for var_name in op.output_arg_names:
if var_gen_op.has_key(var_name):
var_gen_op[var_name].append(op.type)
else:
var_gen_op[var_name] = [op.type]
for op, in_ops in op_in_ops.iteritems():
for in_op in in_ops:
op_op = in_op + "->" + op
if adj_2_op_freq.has_key(op_op):
adj_2_op_freq[op_op] += 1
else:
adj_2_op_freq[op_op] = 1
uni_op_freq = sorted(
uni_op_freq.items(), key=lambda item: item[1], reverse=True)
adj_2_op_freq = sorted(
adj_2_op_freq.items(), key=lambda item: item[1], reverse=True)
return uni_op_freq, adj_2_op_freq
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import quantize_transpiler
from .quantize_transpiler import *
__all__ = quantize_transpiler.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import numpy as np
from paddle.fluid.framework import default_main_program, default_startup_program, program_guard
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import unique_name
from paddle.fluid import core
from paddle.fluid.initializer import Constant
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.layers.nn import autoincreased_step_counter
from paddle.fluid.framework import Variable
from paddle.fluid.executor import global_scope
from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler
__all__ = ['QuantizeTranspiler']
_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']
def _quantized_var_name(var_name):
"""
Return quantized variable name for the input `var_name`.
"""
return "%s.quantized" % (var_name)
def _dequantized_var_name(var_name):
"""
Return dequantized variable name for the input `var_name`.
"""
return "%s.dequantized" % (var_name)
def _quantized_scale_name(var_name):
"""
Return quantized variable name for the input `var_name`.
"""
return "%s.scale" % (var_name)
def _original_var_name(var_name):
"""
Return the original variable name.
"""
if var_name.endswith('.quantized.dequantized'):
return var_name[:-len('.quantized.dequantized')]
if var_name.endswith('.quantized'):
return var_name[:-len('.quantized')]
if var_name.endswith('.dequantized'):
return var_name[:-len('.dequantized')]
if var_name.endswith('.scale'):
return var_name[:-len('.scale')]
else:
return var_name
def _is_float(v):
return isinstance(v, float) or isinstance(v, np.float32)
def quant(x, scale, num_bits):
y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
return y
class QuantizeTranspiler(object):
def __init__(self,
weight_bits=8,
activation_bits=8,
activation_quantize_type='abs_max',
weight_quantize_type='abs_max',
window_size=10000):
"""
Convert and rewrite the fluid Program according to weight and
activation quantization type.
Args:
weight_bits (int): quantization bit number for weights,
the bias is not quantized.
activation_bits (int): quantization bit number for activation.
activation_quantize_type (str): quantization type for activation,
now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode,
the quantization scale will be calculated dynamically each step
in both training and testing period. If use 'range_abs_max',
a static quantization scale will be calculated during training
and used in inference.
weight_quantize_type (str): quantization type for weights,
support 'abs_max'. The 'range_abs_max' usually is not used for
weight, since weights are fixed once the model is well trained.
window_size (int): the window size for 'range_abs_max' quantization.
Examples:
.. code-block:: python
# the original program will be rewrite, if you don't want to
# change it, please clone at first.
# quantize_program = program.clone()
t = fluid.QuantizeTranspiler()
t.transpile(quantize_program)
"""
self.weight_bits = weight_bits
self.activation_bits = activation_bits
quant_type = ['abs_max', 'range_abs_max']
if weight_quantize_type not in quant_type:
raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(weight_quantize_type))
if activation_quantize_type not in quant_type:
raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(activation_quantize_type))
self.weight_quantize_type = weight_quantize_type
self.activation_quantize_type = activation_quantize_type
self.window_size = window_size
self.helper = LayerHelper(self.__class__.__name__)
self.fake_quant_op_types = [
'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
]
self.fake_dequant_op_types = ['fake_dequantize_max_abs']
self.is_test = None
self.global_step = None
def training_transpile(self, program=None, startup_program=None):
"""Rewrites a training input program in place for simulated
quantization. Insert fake quantization and de-quantization ops into
program to simulate the error introduced by quantization. And change
the graident ops' input by using the faked quantization weights and
activation. Since the program is transformed in place, the graph
connection will change.
Args:
program (Program): the input program to be transpile.
"""
self.is_test = False
program = default_main_program() if program is None else program
startup_program = default_startup_program() if startup_program is \
None else startup_program
# marked the variable which has been quantized and dequantized.
dequanted_vars = [
collections.OrderedDict() for _ in range(len(program.blocks))
]
grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES]
params = [p.name for p in program.global_block().iter_parameters()]
def _transpile_forward(block, op):
idx = block.ops.index(op)
block_id = block.idx
# insert quant op and dequant op
for name in op.input_arg_names:
if name in dequanted_vars[block_id]:
dequant_var = dequanted_vars[block_id][name]
else:
var = block.var(name)
quant_bits = self.weight_bits if var.name in params \
else self.activation_bits
quant_type = self.weight_quantize_type if var.name \
in params else self.activation_quantize_type
quant_var, scale_var = self._insert_quant_op(
block, idx, var, quant_bits, quant_type)
dequant_var = self._insert_dequant_op(
block, idx + 1, quant_var, scale_var, quant_bits)
dequanted_vars[block_id][name] = dequant_var
# rename the forward op inputs
op._rename_input(name, dequant_var.name)
def _transpile_backward(block, op):
block_id = block.idx
no_dequanted_input_vars = True
for name in op.input_arg_names:
if name in dequanted_vars[block_id]:
dequant_var = dequanted_vars[block_id][name]
op._rename_input(name, dequant_var.name)
no_dequanted_input_vars = False
if no_dequanted_input_vars:
raise ValueError("There is no dequanted inputs for op %s." %
(op.type))
with program_guard(program, startup_program):
self._create_global_step()
for block in program.blocks:
ops = list(block.ops)
block_id = block.idx
for op in ops:
# rewrite the forward ProgramDes
if op.type in _QUANTIZABLE_OP_TYPES:
_transpile_forward(block, op)
# rename the backward op inputs
if op.type in grad_op_types:
_transpile_backward(block, op)
def _create_global_step(self):
if self.weight_quantize_type == 'range_abs_max' or \
self.activation_quantize_type == 'range_abs_max':
self.global_step = autoincreased_step_counter()
def freeze_program(self, program, place, fuse_bn=False, scope=None):
"""Freeze input training program for inference.
Args:
program (Program): the input program to be transpile.
"""
self.is_test = True
scope = global_scope() if scope is None else scope
program = default_main_program() if program is None else program
if fuse_bn:
bn_fuse_transpiler = BNFuseTranspiler()
bn_fuse_transpiler.transpile(program, place)
persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, program.list_vars())
]
op_in_rename_map = [
collections.OrderedDict() for _ in range(len(program.blocks))
]
op_out_rename_map = [
collections.OrderedDict() for _ in range(len(program.blocks))
]
var_scale_map = [
collections.OrderedDict() for _ in range(len(program.blocks))
]
def _remove_fake_quant_and_dequant_op(block, op):
idx = block.ops.index(op)
block_id = block.idx
k = op.output('Out')[0]
v = op.input('X')[0]
if v not in op_in_rename_map[block_id]:
op_in_rename_map[block_id][k] = v
else:
op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v]
block._remove_op(idx)
def _insert_post_dequant_op(block, op):
idx = block.ops.index(op)
block_id = block.idx
max_range = None
scale_var = None
for name in op.input_arg_names:
if name in op_in_rename_map[block_id]:
op._rename_input(name, op_in_rename_map[block_id][name])
scale_v = var_scale_map[block_id][_original_var_name(name)]
if _original_var_name(name) in persistable_vars:
param_range = (1 << (self.weight_bits - 1)) - 1
act_range = (1 << (self.activation_bits - 1)) - 1
assert _is_float(scale_v)
max_range = param_range * act_range / scale_v
else:
assert isinstance(scale_v, Variable)
scale_var = var_scale_map[block_id][_original_var_name(
name)]
if len(op.output_arg_names) != 1:
raise ValueError("Only support one output, but op %s has"
" more than one output." % (op.type))
out_var = block.var(op.output_arg_names[0])
dequant_var = block.create_var(
name=_dequantized_var_name(out_var.name),
type=out_var.type,
shape=out_var.shape,
dtype=out_var.dtype)
# insert fake_dequantize_op
dequant_op = block._insert_op(
idx + 1,
type="fake_dequantize_max_abs",
attrs={'max_range': float(max_range)},
inputs={"X": out_var,
'Scale': scale_var},
outputs={"Out": dequant_var})
op_out_rename_map[block_id][out_var.name] = dequant_var.name
return dequant_var
def _load_var(name):
return np.array(scope.find_var(name).get_tensor())
def _restore_var(name, arr):
t = scope.find_var(name).get_tensor()
t.set(arr, place)
for block in program.blocks:
ops = list(block.ops)
block_id = block.idx
for op in ops:
op_type = op.type
# insert dequant_op after fc/conv, need to rename
# input of the followed ops
for name in op.input_arg_names:
if name in op_out_rename_map[block_id]:
op._rename_input(name,
op_out_rename_map[block_id][name])
if op_type in self.fake_quant_op_types:
in_arg_name = op.input('X')[0]
if in_arg_name in persistable_vars:
if self.weight_quantize_type == 'abs_max':
param = _load_var(in_arg_name)
scale_v = np.max(np.abs(param))
else:
scale_v = _load_var(op.output('OutScale')[0])
var_scale_map[block_id][in_arg_name] = scale_v
else:
scale_v = block.var(op.output('OutScale')[0])
var_scale_map[block_id][in_arg_name] = scale_v
if in_arg_name in persistable_vars:
_remove_fake_quant_and_dequant_op(block, op)
# quantize weight and restore
param_t = _load_var(in_arg_name)
param_q_t = quant(param_t, scale_v, self.weight_bits)
_restore_var(in_arg_name, param_q_t)
if op_type in self.fake_dequant_op_types:
_remove_fake_quant_and_dequant_op(block, op)
if op_type in _QUANTIZABLE_OP_TYPES:
dequant_var = _insert_post_dequant_op(block, op)
# remove the unused var in ProgramDesc
self._remove_unused_var(program)
#program = program.clone()
def convert_to_int8(self, program, place, scope=None):
scope = global_scope() if scope is None else scope
program = default_main_program() if program is None else program
def _load_var(name):
return np.array(scope.find_var(name).get_tensor())
global_block = program.global_block()
def convert_to_int8(var):
int8_var_name = var.name + ".int8"
int8_var = global_block.create_parameter(
name=int8_var_name.encode('ascii'),
type=var.type,
dtype=core.VarDesc.VarType.INT8,
shape=var.shape)
tensor = _load_var(var.name)
scope.var(int8_var_name)
int8_tensor = scope.find_var(int8_var_name).get_tensor()
int8_tensor.set(tensor.astype(np.int8), place)
return int8_var
input_map = {}
for block in program.blocks:
for op in list(block.ops):
if op.type in _QUANTIZABLE_OP_TYPES:
for name in op.input_arg_names:
var = block.var(name)
if var.persistable:
if name not in input_map:
int8_var = convert_to_int8(var)
input_map[name] = int8_var.name
op._rename_input(name, input_map[name])
self._remove_unused_var(program)
def _remove_unused_var(self, program):
all_remove_vars = []
for block in program.blocks:
args = []
for op in block.ops:
args += op.input_arg_names
args += op.output_arg_names
args = list(set(args))
var_names = block.vars.keys()
sub_block_remove_vars = []
for var in var_names:
if var not in args:
sub_block_remove_vars.append(var)
all_remove_vars.append(sub_block_remove_vars)
remove_vars = [list(set(v)) for v in all_remove_vars]
for i, block in enumerate(program.blocks):
for v in remove_vars[i]:
block._remove_var(v)
def _insert_quant_abs_max_op(self, block, idx, var, quant_bits):
"""Insert fake_quantize_abs_max op.
"""
quant_var = block.create_var(
name=_quantized_var_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
scale = block.create_var(
name=_quantized_scale_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
quant_op = block._insert_op(
idx,
type='fake_quantize_abs_max',
attrs={'bit_length': quant_bits},
inputs={'X': var},
outputs={'Out': quant_var,
'OutScale': scale})
return quant_var, scale
def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
"""Insert fake_quantize_range_abs_max
"""
quant_var = block.create_var(
name=_quantized_var_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
scale = self.helper.create_parameter(
attr=ParamAttr(
name=_quantized_scale_name(var.name),
initializer=Constant(0.001),
trainable=False),
shape=[1],
dtype=var.dtype)
scale.stop_gradient = True
ins = {'X': var, 'InScale': scale}
outs = {'Out': quant_var, 'OutScale': scale}
if not self.is_test:
# A global step counter variable with type int64
scales = self.helper.create_global_variable(
name=unique_name.generate('scales'),
persistable=True,
dtype=var.dtype,
shape=[self.window_size])
self.helper.set_variable_initializer(
scales, initializer=Constant(value=0))
ins['Iter'] = self.global_step
outs['OutScales'] = scales
attrs = {
'window_size': self.window_size,
'bit_length': quant_bits,
'is_test': self.is_test
}
quant_op = block._insert_op(
idx,
type='fake_quantize_range_abs_max',
attrs=attrs,
inputs=ins,
outputs=outs)
return quant_var, scale
def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
"""
Insert fake_quantize_op
"""
if quant_type == 'abs_max':
return self._insert_quant_abs_max_op(block, idx, var, quant_bits)
elif quant_type == 'range_abs_max':
return self._insert_quant_range_abs_max_op(block, idx, var,
quant_bits)
def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
"""
Insert fake_quantize_op
"""
dequant_var = block.create_var(
name=_dequantized_var_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
# insert fake_dequantize_op
max_range = (1 << (quant_bits - 1)) - 1
dequant_op = block._insert_op(
idx,
type="fake_dequantize_max_abs",
attrs={'max_range': float(max_range)},
inputs={"X": var,
'Scale': scale},
outputs={"Out": dequant_var})
return dequant_var
class BNFuseTranspiler(InferenceTranspiler):
def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
def _update_param(op, param_name, new_param):
var = self.block.vars[param_name]
tensor = self.scope.find_var(param_name).get_tensor()
tensor.set(np.array(new_param), self.place)
def _load_param(param_name):
return np.array(self.scope.find_var(param_name).get_tensor())
bias_bn = _load_param(bn_op.input("Bias")[0]) #Bias
scale_bn = _load_param(bn_op.input("Scale")[0]) #Scale
mean_bn = _load_param(bn_op.input("Mean")[0]) #Mean
var_bn = _load_param(bn_op.input("Variance")[0]) #Variance
if current_op.type in ['conv2d', 'depthwise_conv2d']:
current_param = _load_param(
_original_var_name(current_op.input("Filter")[0]))
elif current_op.type == 'mul':
current_param = _load_param(
_original_var_name(current_op.input("Y")[0]))
std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
tmp = np.float32(np.divide(scale_bn, std_bn))
# add bias of batch_norm_op to conv2d
if with_bias:
bias = _load_param(bias_op.input("Y"))
else:
bias = np.zeros(bias_bn.shape)
bias = np.float32(
np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
# re-compute weight of conv2d/fc
tmp = tmp.reshape(tmp.shape[0], -1)
dst_param = current_param.reshape((tmp.shape[0], -1))
dst_param = np.float32(np.multiply(dst_param, tmp))
dst_param = dst_param.reshape(current_param.shape)
# update parameters
if current_op.type in ['conv2d', 'depthwise_conv2d']:
_update_param(current_op,
_original_var_name(current_op.input("Filter")[0]),
dst_param)
elif current_op.type == 'mul':
_update_param(current_op,
_original_var_name(current_op.input("Y")[0]),
dst_param)
_update_param(bias_op, bias_op.input("Y")[0], bias)
# collect the renamed input
self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import numpy as np
import six
import unittest
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name
from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler
def linear_fc(num):
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = data
for _ in six.moves.xrange(num):
hidden = fluid.layers.fc(hidden, size=128, act='relu')
loss = fluid.layers.cross_entropy(input=hidden, label=label)
loss = fluid.layers.mean(loss)
return loss
def residual_block(num):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act)
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = data
for _ in six.moves.xrange(num):
conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
fc = fluid.layers.fc(input=hidden, size=10)
loss = fluid.layers.cross_entropy(input=fc, label=label)
loss = fluid.layers.mean(loss)
return loss
def conv_net(img, label):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
return avg_loss
class TestQuantizeTranspiler(unittest.TestCase):
def setUp(self):
# since quant_op and dequant_op is not ready, use cos and sin for test
self.weight_quant_op_type = 'fake_quantize_abs_max'
self.dequant_op_type = 'fake_dequantize_max_abs'
self.quantizable_op_and_inputs = {
'conv2d': ['Input', 'Filter'],
'depthwise_conv2d': ['Input', 'Filter'],
'mul': ['X', 'Y']
}
self.quantizable_op_grad_and_inputs = {
'conv2d_grad': ['Input', 'Filter'],
'depthwise_conv2d_grad': ['Input', 'Filter'],
'mul_grad': ['X', 'Y']
}
def check_program(self, program):
quantized_ops = {}
persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, program.list_vars())
]
for block in program.blocks:
for idx, op in enumerate(block.ops):
# check forward
if op.type in self.quantizable_op_and_inputs:
for i, arg_name in enumerate(op.input_arg_names):
quant_op_type = self.weight_quant_op_type if \
_original_var_name(arg_name) \
in persistable_vars else self.act_quant_op_type
self.assertTrue(
arg_name.endswith('.quantized.dequantized'))
if arg_name not in quantized_ops:
self.assertEqual(block.ops[idx - 2 * i - 1].type,
self.dequant_op_type)
self.assertEqual(block.ops[idx - 2 * i - 2].type,
quant_op_type)
quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
else:
op_idx = block.ops.index(quantized_ops[arg_name])
self.assertLess(op_idx, idx)
# check backward
if op.type in self.quantizable_op_grad_and_inputs:
for pname in self.quantizable_op_grad_and_inputs[op.type]:
arg_name = op.input(pname)[0]
self.assertTrue(
arg_name.endswith('.quantized.dequantized'))
self.assertTrue(arg_name in quantized_ops)
def linear_fc_quant(self, quant_type):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = linear_fc(3)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
t = QuantizeTranspiler(activation_quantize_type=quant_type)
t.training_transpile(main)
self.check_program(main)
def test_linear_fc_quant_abs_max(self):
self.act_quant_op_type = 'fake_quantize_abs_max'
self.linear_fc_quant('abs_max')
def test_linear_fc_quant_range_abs_max(self):
self.act_quant_op_type = 'fake_quantize_range_abs_max'
self.linear_fc_quant('range_abs_max')
def residual_block_quant(self, quant_type):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = residual_block(2)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
t = QuantizeTranspiler(activation_quantize_type=quant_type)
t.training_transpile(main)
self.check_program(main)
def test_residual_block_abs_max(self):
self.act_quant_op_type = 'fake_quantize_abs_max'
self.residual_block_quant('abs_max')
def test_residual_block_range_abs_max(self):
self.act_quant_op_type = 'fake_quantize_range_abs_max'
self.residual_block_quant('range_abs_max')
def freeze_program(self, use_cuda):
def build_program(main, startup, is_test):
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
img = fluid.layers.data(
name='image', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
loss = conv_net(img, label)
if not is_test:
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
return [img, label], loss
main = fluid.Program()
startup = fluid.Program()
test_program = fluid.Program()
feeds, loss = build_program(main, startup, False)
build_program(test_program, startup, True)
test_program = test_program.clone(for_test=True)
quant_transpiler = QuantizeTranspiler()
quant_transpiler.training_transpile(main)
quant_transpiler.training_transpile(test_program)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
iter = 5
batch_size = 8
class_num = 10
exe.run(startup)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
feeder = fluid.DataFeeder(feed_list=feeds, place=place)
with fluid.program_guard(main):
for _ in range(iter):
data = next(train_reader())
loss_v = exe.run(program=main,
feed=feeder.feed(data),
fetch_list=[loss])
with fluid.program_guard(test_program):
test_data = next(test_reader())
w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
test_program)
# Testing during training
test_loss1, w_quant = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[loss, w_var])
# Freeze program for inference, but the weight of fc/conv is still float type.
quant_transpiler.freeze_program(test_program, place)
test_loss2, = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[loss])
self.assertAlmostEqual(test_loss1, test_loss2, delta=1e-3)
w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
.get_tensor())
self.assertEqual(np.sum(w_freeze), np.sum(w_quant))
# Convert parameter to 8-bit.
quant_transpiler.convert_to_int8(test_program, place)
# Save the 8-bit parameter and model file.
fluid.io.save_inference_model('model_8bit', ['image', 'label'],
[loss], exe, test_program)
# Test whether the 8-bit parameter and model file can be loaded successfully.
[infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
exe)
# Check the loaded 8-bit weight.
w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
.get_tensor())
self.assertEqual(w_8bit.dtype, np.int8)
self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
def test_freeze_program_cuda(self):
if fluid.core.is_compiled_with_cuda():
with fluid.unique_name.guard():
self.freeze_program(True)
def test_freeze_program_cpu(self):
with fluid.unique_name.guard():
self.freeze_program(False)
if __name__ == '__main__':
unittest.main()
...@@ -40,11 +40,9 @@ PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None ...@@ -40,11 +40,9 @@ PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None
__all__ = [ __all__ = [
'Program', 'Program',
'Operator',
'default_startup_program', 'default_startup_program',
'default_main_program', 'default_main_program',
'program_guard', 'program_guard',
'get_var',
'name_scope', 'name_scope',
] ]
...@@ -663,11 +661,11 @@ class Operator(object): ...@@ -663,11 +661,11 @@ class Operator(object):
self._update_desc_attr(attr_name, attr_val) self._update_desc_attr(attr_name, attr_val)
self.desc.check_attrs() self.desc.check_attrs()
if self.has_kernel(type): if self._has_kernel(type):
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc) self.desc.infer_shape(self.block.desc)
def has_kernel(self, op_type): def _has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET return op_type not in self.OP_WITHOUT_KERNEL_SET
def to_string(self, throw_on_error): def to_string(self, throw_on_error):
...@@ -708,7 +706,7 @@ class Operator(object): ...@@ -708,7 +706,7 @@ class Operator(object):
""" """
return self.desc.input(name) return self.desc.input(name)
def rename_input(self, old_name, new_name): def _rename_input(self, old_name, new_name):
""" """
Rename the `old_name` to `new_name`. Rename the `old_name` to `new_name`.
...@@ -719,9 +717,9 @@ class Operator(object): ...@@ -719,9 +717,9 @@ class Operator(object):
Returns: Returns:
None None
""" """
self.desc.rename_input(old_name, new_name) self.desc._rename_input(old_name, new_name)
def rename_output(self, old_name, new_name): def _rename_output(self, old_name, new_name):
""" """
Rename the `old_name` to `new_name`. Rename the `old_name` to `new_name`.
...@@ -732,7 +730,7 @@ class Operator(object): ...@@ -732,7 +730,7 @@ class Operator(object):
Returns: Returns:
None None
""" """
self.desc.rename_output(old_name, new_name) self.desc._rename_output(old_name, new_name)
@property @property
def input_names(self): def input_names(self):
...@@ -796,7 +794,7 @@ class Operator(object): ...@@ -796,7 +794,7 @@ class Operator(object):
""" """
return self.desc.attr_type(name) return self.desc.attr_type(name)
def set_attr(self, name, val): def _set_attr(self, name, val):
""" """
Set the value of attribute by attribute's name. Set the value of attribute by attribute's name.
...@@ -829,7 +827,7 @@ class Operator(object): ...@@ -829,7 +827,7 @@ class Operator(object):
isinstance(val, core.ProgramDesc): isinstance(val, core.ProgramDesc):
self.desc.set_serialized_attr(name, val.serialize_to_string()) self.desc.set_serialized_attr(name, val.serialize_to_string())
else: else:
self.desc.set_attr(name, val) self.desc._set_attr(name, val)
@property @property
def attr_names(self): def attr_names(self):
...@@ -848,7 +846,7 @@ class Operator(object): ...@@ -848,7 +846,7 @@ class Operator(object):
""" """
return self.desc.attr(name) return self.desc.attr(name)
def block_attr_id(self, name): def _block_attr_id(self, name):
""" """
Get the block attribute's id by name. Get the block attribute's id by name.
...@@ -858,9 +856,9 @@ class Operator(object): ...@@ -858,9 +856,9 @@ class Operator(object):
Returns: Returns:
int: the block index. int: the block index.
""" """
return self.desc.block_attr_id(name) return self.desc._block_attr_id(name)
def block_attr(self, name): def _block_attr(self, name):
""" """
Get the block attribute by name. Get the block attribute by name.
...@@ -871,11 +869,11 @@ class Operator(object): ...@@ -871,11 +869,11 @@ class Operator(object):
block: the block attribute. block: the block attribute.
""" """
id = self.block_attr_id(name) id = self._block_attr_id(name)
assert (id >= 0 and id < len(self.block.program.blocks)) assert (id >= 0 and id < len(self.block.program.blocks))
return self.block.program.blocks[id] return self.block.program.blocks[id]
def blocks_attr(self, name): def _blocks_attr(self, name):
""" """
Get the blocks attribute by name. Get the blocks attribute by name.
...@@ -886,13 +884,13 @@ class Operator(object): ...@@ -886,13 +884,13 @@ class Operator(object):
list: list of the blocks attribute. list: list of the blocks attribute.
""" """
attrs = [] attrs = []
for i in self.blocks_attr_ids(name): for i in self._blocks_attr_ids(name):
assert (i >= 0 and i < len(self.block.program.blocks)) assert (i >= 0 and i < len(self.block.program.blocks))
attrs.append(self.block.program.blocks[i]) attrs.append(self.block.program.blocks[i])
return attrs return attrs
def blocks_attr_ids(self, name): def _blocks_attr_ids(self, name):
""" """
Get the blocks attribute's ids by name. Get the blocks attribute's ids by name.
...@@ -903,7 +901,7 @@ class Operator(object): ...@@ -903,7 +901,7 @@ class Operator(object):
list: list of the blocks ids. list: list of the blocks ids.
""" """
return self.desc.blocks_attr_ids(name) return self.desc._blocks_attr_ids(name)
def all_attrs(self): def all_attrs(self):
""" """
...@@ -917,11 +915,11 @@ class Operator(object): ...@@ -917,11 +915,11 @@ class Operator(object):
for n in attr_names: for n in attr_names:
attr_type = self.desc.attr_type(n) attr_type = self.desc.attr_type(n)
if attr_type == core.AttrType.BLOCK: if attr_type == core.AttrType.BLOCK:
attr_map[n] = self.block_attr(n) attr_map[n] = self._block_attr(n)
continue continue
if attr_type == core.AttrType.BLOCKS: if attr_type == core.AttrType.BLOCKS:
attr_map[n] = self.blocks_attr(n) attr_map[n] = self._blocks_attr(n)
continue continue
attr_map[n] = self.attr(n) attr_map[n] = self.attr(n)
...@@ -1795,7 +1793,7 @@ class Program(object): ...@@ -1795,7 +1793,7 @@ class Program(object):
for j in six.moves.range(block.op_size()): for j in six.moves.range(block.op_size()):
op = block.op(j) op = block.op(j)
if op.has_attr('is_test'): if op.has_attr('is_test'):
op.set_attr('is_test', True) op._set_attr('is_test', True)
res.blocks = [ res.blocks = [
Block(res, i) for i in six.moves.range(res.desc.num_blocks()) Block(res, i) for i in six.moves.range(res.desc.num_blocks())
] ]
...@@ -2169,7 +2167,7 @@ def program_guard(main_program, startup_program=None): ...@@ -2169,7 +2167,7 @@ def program_guard(main_program, startup_program=None):
switch_startup_program(startup_program) switch_startup_program(startup_program)
def get_var(name, program=None): def _get_var(name, program=None):
""" """
Get a variable by name from the global block of a program. Get a variable by name from the global block of a program.
......
...@@ -600,7 +600,7 @@ def save_inference_model(dirname, ...@@ -600,7 +600,7 @@ def save_inference_model(dirname,
""" """
if isinstance(feeded_var_names, six.string_types): if isinstance(feeded_var_names, six.string_types):
feeded_var_names = [feeded_var_names] feeded_var_names = [feeded_var_names]
else: elif export_for_deployment:
if len(feeded_var_names) > 0: if len(feeded_var_names) > 0:
# TODO(paddle-dev): polish these code blocks # TODO(paddle-dev): polish these code blocks
if not (bool(feeded_var_names) and all( if not (bool(feeded_var_names) and all(
...@@ -610,61 +610,60 @@ def save_inference_model(dirname, ...@@ -610,61 +610,60 @@ def save_inference_model(dirname,
if isinstance(target_vars, Variable): if isinstance(target_vars, Variable):
target_vars = [target_vars] target_vars = [target_vars]
else: elif export_for_deployment:
if not (bool(target_vars) and all( if not (bool(target_vars) and all(
isinstance(var, Variable) for var in target_vars)): isinstance(var, Variable) for var in target_vars)):
raise ValueError("'target_vars' should be a list of Variable.") raise ValueError("'target_vars' should be a list of Variable.")
if main_program is None: if main_program is None:
main_program = default_main_program() main_program = default_main_program()
copy_program = main_program.clone()
# if there is lookup table, the trainer 0 will notify all pserver to save.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
os.makedirs(dirname) os.makedirs(dirname)
if model_filename is not None:
model_basename = os.path.basename(model_filename)
else:
model_basename = "__model__"
model_basename = os.path.join(dirname, model_basename)
# When export_for_deployment is true, we modify the program online so that # When export_for_deployment is true, we modify the program online so that
# it can only be loaded for inference directly. If it's false, the whole # it can only be loaded for inference directly. If it's false, the whole
# original program and related meta are saved so that future usage can be # original program and related meta are saved so that future usage can be
# more flexible. # more flexible.
if export_for_deployment: if export_for_deployment:
global_block = copy_program.global_block() main_program = main_program.clone()
global_block = main_program.global_block()
for i, op in enumerate(global_block.ops): for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False) op.desc.set_is_target(False)
if op.type == "feed" or op.type == "fetch": if op.type == "feed" or op.type == "fetch":
global_block._remove_op(i) global_block._remove_op(i)
copy_program.desc.flush() main_program.desc.flush()
pruned_program = copy_program._prune(targets=target_vars) main_program = main_program._prune(targets=target_vars)
saved_program = pruned_program._inference_optimize(prune_read_op=True) main_program = main_program._inference_optimize(prune_read_op=True)
fetch_var_names = [v.name for v in target_vars] fetch_var_names = [v.name for v in target_vars]
prepend_feed_ops(saved_program, feeded_var_names) prepend_feed_ops(main_program, feeded_var_names)
append_fetch_ops(saved_program, fetch_var_names) append_fetch_ops(main_program, fetch_var_names)
with open(model_basename, "wb") as f:
f.write(main_program.desc.serialize_to_string())
else: else:
# TODO(panyx0718): Save more information so that it can also be used # TODO(panyx0718): Save more information so that it can also be used
# for training and more flexible post-processing. # for training and more flexible post-processing.
saved_program = copy_program with open(model_basename + ".main_program", "wb") as f:
f.write(main_program.desc.serialize_to_string())
if model_filename is not None:
model_filename = os.path.basename(model_filename)
else:
model_filename = "__model__"
model_filename = os.path.join(dirname, model_filename)
if params_filename is not None: if params_filename is not None:
params_filename = os.path.basename(params_filename) params_filename = os.path.basename(params_filename)
save_persistables(executor, dirname, main_program, params_filename)
with open(model_filename, "wb") as f:
f.write(saved_program.desc.serialize_to_string())
save_persistables(executor, dirname, saved_program, params_filename)
# if there is lookup table, the trainer 0 will notify all pserver to save.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
def load_inference_model(dirname, def load_inference_model(dirname,
......
...@@ -284,7 +284,7 @@ def detection_output(loc, ...@@ -284,7 +284,7 @@ def detection_output(loc,
target_box=loc, target_box=loc,
code_type='decode_center_size') code_type='decode_center_size')
compile_shape = scores.shape compile_shape = scores.shape
run_shape = ops.shape(scores) run_shape = nn.shape(scores)
scores = nn.flatten(x=scores, axis=2) scores = nn.flatten(x=scores, axis=2)
scores = nn.softmax(input=scores) scores = nn.softmax(input=scores)
scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape) scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
...@@ -697,7 +697,7 @@ def ssd_loss(location, ...@@ -697,7 +697,7 @@ def ssd_loss(location,
raise ValueError("Only support mining_type == max_negative now.") raise ValueError("Only support mining_type == max_negative now.")
num, num_prior, num_class = confidence.shape num, num_prior, num_class = confidence.shape
conf_shape = ops.shape(confidence) conf_shape = nn.shape(confidence)
def __reshape_to_2d(var): def __reshape_to_2d(var):
return nn.flatten(x=var, axis=2) return nn.flatten(x=var, axis=2)
...@@ -724,7 +724,7 @@ def ssd_loss(location, ...@@ -724,7 +724,7 @@ def ssd_loss(location,
target_label.stop_gradient = True target_label.stop_gradient = True
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples # 3. Mining hard examples
actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2]) actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2])
actual_shape.stop_gradient = True actual_shape.stop_gradient = True
conf_loss = nn.reshape( conf_loss = nn.reshape(
x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape) x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
......
...@@ -507,7 +507,6 @@ def py_reader(capacity, ...@@ -507,7 +507,6 @@ def py_reader(capacity,
1. The basic usage of :code:`py_reader` is as follows: 1. The basic usage of :code:`py_reader` is as follows:
>>> import paddle.v2
>>> import paddle.fluid as fluid >>> import paddle.fluid as fluid
>>> import paddle.dataset.mnist as mnist >>> import paddle.dataset.mnist as mnist
>>> >>>
...@@ -515,7 +514,7 @@ def py_reader(capacity, ...@@ -515,7 +514,7 @@ def py_reader(capacity,
>>> shapes=[(-1,3,224,224), (-1,1)], >>> shapes=[(-1,3,224,224), (-1,1)],
>>> dtypes=['float32', 'int64']) >>> dtypes=['float32', 'int64'])
>>> reader.decorate_paddle_reader( >>> reader.decorate_paddle_reader(
>>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) >>> paddle.reader.shuffle(paddle.batch(mnist.train())
>>> >>>
>>> img, label = fluid.layers.read_file(reader) >>> img, label = fluid.layers.read_file(reader)
>>> loss = network(img, label) # some network definition >>> loss = network(img, label) # some network definition
...@@ -534,7 +533,6 @@ def py_reader(capacity, ...@@ -534,7 +533,6 @@ def py_reader(capacity,
2. When training and testing are both performed, two different 2. When training and testing are both performed, two different
:code:`py_reader` should be created with different names, e.g.: :code:`py_reader` should be created with different names, e.g.:
>>> import paddle.v2
>>> import paddle.fluid as fluid >>> import paddle.fluid as fluid
>>> import paddle.dataset.mnist as mnist >>> import paddle.dataset.mnist as mnist
>>> >>>
...@@ -548,7 +546,7 @@ def py_reader(capacity, ...@@ -548,7 +546,7 @@ def py_reader(capacity,
>>> dtypes=['float32', 'int64'], >>> dtypes=['float32', 'int64'],
>>> name='train_reader') >>> name='train_reader')
>>> train_reader.decorate_paddle_reader( >>> train_reader.decorate_paddle_reader(
>>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) >>> paddle.reader.shuffle(paddle.batch(mnist.train())
>>> >>>
>>> test_reader = fluid.layers.py_reader(capacity=32, >>> test_reader = fluid.layers.py_reader(capacity=32,
>>> shapes=[(-1,3,224,224), (-1,1)], >>> shapes=[(-1,3,224,224), (-1,1)],
......
...@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None):
return acc_out return acc_out
def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): def auc(input,
label,
curve='ROC',
num_thresholds=2**12 - 1,
topk=1,
slide_steps=1):
""" """
**Area Under the Curve (AUC) Layer** **Area Under the Curve (AUC) Layer**
...@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): ...@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
num_thresholds(int): The number of thresholds to use when discretizing num_thresholds(int): The number of thresholds to use when discretizing
the roc curve. Default 200. the roc curve. Default 200.
topk(int): only topk number of prediction output will be used for auc. topk(int): only topk number of prediction output will be used for auc.
slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps.
Returns: Returns:
Variable: A scalar representing the current AUC. Variable: A scalar representing the current AUC.
...@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): ...@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
auc_out = helper.create_tmp_variable(dtype="float64") auc_out = helper.create_tmp_variable(dtype="float64")
batch_auc_out = helper.create_tmp_variable(dtype="float64") batch_auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches. # make tp, tn, fp, fn persistable, so that can accumulate all batches.
# for batch auc
batch_stat_pos = helper.create_global_variable(
persistable=True,
dtype='int64',
shape=[slide_steps, num_thresholds + 1])
batch_stat_neg = helper.create_global_variable(
persistable=True,
dtype='int64',
shape=[slide_steps, num_thresholds + 1])
# for global auc
stat_pos = helper.create_global_variable( stat_pos = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds + 1]) persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
stat_neg = helper.create_global_variable( stat_neg = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds + 1]) persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
for var in [stat_pos, stat_neg]: for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
helper.set_variable_initializer( helper.set_variable_initializer(
var, Constant( var, Constant(
value=0.0, force_cpu=True)) value=0.0, force_cpu=True))
# Batch AUC
helper.append_op(
type="auc",
inputs={
"Predict": [input],
"Label": [label],
"StatPos": [batch_stat_pos],
"StatNeg": [batch_stat_neg]
},
attrs={
"curve": curve,
"num_thresholds": num_thresholds,
"slide_steps": slide_steps
},
outputs={
"AUC": [batch_auc_out],
"StatPosOut": [batch_stat_pos],
"StatNegOut": [batch_stat_neg]
})
# Global AUC
helper.append_op( helper.append_op(
type="auc", type="auc",
inputs={ inputs={
...@@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): ...@@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
"StatPos": [stat_pos], "StatPos": [stat_pos],
"StatNeg": [stat_neg] "StatNeg": [stat_neg]
}, },
attrs={"curve": curve, attrs={
"num_thresholds": num_thresholds}, "curve": curve,
"num_thresholds": num_thresholds,
"slide_steps": 0
},
outputs={ outputs={
"AUC": [auc_out], "AUC": [auc_out],
"BatchAUC": [batch_auc_out],
"StatPosOut": [stat_pos], "StatPosOut": [stat_pos],
"StatNegOut": [stat_neg] "StatNegOut": [stat_neg]
}) })
return auc_out, batch_auc_out, [stat_pos, stat_neg] return auc_out, batch_auc_out, [
batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
]
...@@ -29,110 +29,29 @@ from .. import unique_name ...@@ -29,110 +29,29 @@ from .. import unique_name
from functools import reduce from functools import reduce
__all__ = [ __all__ = [
'fc', 'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
'embedding', 'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
'dynamic_lstm', 'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d',
'dynamic_lstmp', 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d',
'dynamic_gru', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose',
'gru_unit', 'sequence_expand', 'sequence_expand_as', 'sequence_pad', 'lstm_unit',
'linear_chain_crf', 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
'crf_decoding', 'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
'cos_sim', 'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
'cross_entropy', 'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
'square_error_cost', 'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm',
'chunk_eval', 'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
'sequence_conv', 'autoincreased_step_counter', 'reshape', 'squeeze', 'unsqueeze',
'conv2d', 'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', 'roi_pool',
'conv3d', 'dice_loss', 'image_resize', 'image_resize_short', 'resize_bilinear',
'sequence_pool', 'gather', 'scatter', 'sequence_scatter', 'random_crop', 'mean_iou', 'relu',
'sequence_softmax', 'log', 'crop', 'rank_loss', 'elu', 'relu6', 'pow', 'stanh', 'hard_sigmoid',
'softmax', 'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', 'flatten',
'pool2d', 'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate',
'pool3d', 'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div',
'batch_norm', 'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min',
'beam_search_decode', 'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random',
'conv2d_transpose', 'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape'
'conv3d_transpose',
'sequence_expand',
'sequence_expand_as',
'sequence_pad',
'lstm_unit',
'reduce_sum',
'reduce_mean',
'reduce_max',
'reduce_min',
'reduce_prod',
'sequence_first_step',
'sequence_last_step',
'dropout',
'split',
'ctc_greedy_decoder',
'edit_distance',
'l2_normalize',
'matmul',
'topk',
'warpctc',
'sequence_reshape',
'transpose',
'im2sequence',
'nce',
'hsigmoid',
'beam_search',
'row_conv',
'multiplex',
'layer_norm',
'softmax_with_cross_entropy',
'smooth_l1',
'one_hot',
'autoincreased_step_counter',
'reshape',
'squeeze',
'unsqueeze',
'lod_reset',
'lrn',
'pad',
'pad_constant_like',
'label_smooth',
'roi_pool',
'dice_loss',
'image_resize',
'image_resize_short',
'resize_bilinear',
'gather',
'scatter',
'sequence_scatter',
'random_crop',
'mean_iou',
'relu',
'log',
'crop',
'rank_loss',
'elu',
'relu6',
'pow',
'stanh',
'hard_sigmoid',
'swish',
'prelu',
'brelu',
'leaky_relu',
'soft_relu',
'flatten',
'sequence_mask',
'stack',
'pad2d',
'unstack',
'sequence_enumerate',
'expand',
'sequence_concat',
'scale',
'elementwise_add',
'elementwise_div',
'elementwise_sub',
'elementwise_mul',
'elementwise_max',
'elementwise_min',
'elementwise_pow',
] ]
...@@ -6463,6 +6382,246 @@ def expand(x, expand_times, name=None): ...@@ -6463,6 +6382,246 @@ def expand(x, expand_times, name=None):
return out return out
from paddle.fluid.framework import convert_np_dtype_to_dtype_
@templatedoc()
def uniform_random_batch_size_like(input,
shape,
dtype='float32',
input_dim_idx=0,
output_dim_idx=0,
min=-1.0,
max=1.0,
seed=0):
"""
${comment}
Args:
input (Variable): ${input_comment}
shape (tuple|list): ${shape_comment}
input_dim_idx (Int): ${input_dim_idx_comment}
output_dim_idx (Int): ${output_dim_idx_comment}
min (Float): ${min_comment}
max (Float): ${max_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('uniform_random_batch_size_like', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='uniform_random_batch_size_like',
inputs={'Input': input},
outputs={'Out': out},
attrs={
'shape': shape,
'input_dim_idx': input_dim_idx,
'output_dim_idx': output_dim_idx,
'min': min,
'max': max,
'seed': seed,
'dtype': c_dtype
})
return out
@templatedoc()
def gaussian_random(shape,
mean=0.0,
std=1.0,
seed=0,
dtype='float32',
use_mkldnn=False):
"""
${comment}
Args:
shape (tuple|list): ${shape_comment}
mean (Float): ${mean_comment}
std (Float): ${std_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
use_mkldnn (Bool): Only used in mkldnn kernel.
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('gaussian_random', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='gaussian_random',
outputs={'Out': out},
attrs={
'shape': shape,
'mean': mean,
'std': std,
'seed': seed,
'dtype': c_dtype,
'use_mkldnn': use_mkldnn
})
return out
@templatedoc()
def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
"""
${comment}
Args:
x (Variable): ${x_comment}
min (Float): ${min_comment}
max (Float): ${max_comment}
seed (Float): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('sampling_id', **locals())
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='sampling_id',
inputs={'X': x},
outputs={'Out': out},
attrs={'min': min,
'max': max,
'seed': seed})
return out
@templatedoc()
def gaussian_random_batch_size_like(input,
shape,
input_dim_idx=0,
output_dim_idx=0,
mean=0.0,
std=1.0,
seed=0,
dtype='float32'):
"""
${comment}
Args:
input (Variable): ${input_comment}
shape (tuple|list): ${shape_comment}
input_dim_idx (Int): ${input_dim_idx_comment}
output_dim_idx (Int): ${output_dim_idx_comment}
mean (Float): ${mean_comment}
std (Float): ${std_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('gaussian_random_batch_size_like', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='gaussian_random_batch_size_like',
inputs={'Input': input},
outputs={'Out': out},
attrs={
'shape': shape,
'input_dim_idx': input_dim_idx,
'output_dim_idx': output_dim_idx,
'mean': mean,
'std': std,
'seed': seed,
'dtype': c_dtype
})
return out
@templatedoc()
def sum(x, use_mkldnn=False):
"""
${comment}
Args:
x (Variable): ${x_comment}
use_mkldnn (Bool): ${use_mkldnn_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('sum', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('x'))
helper.append_op(
type='sum',
inputs={'X': x},
outputs={'Out': out},
attrs={'use_mkldnn': use_mkldnn})
return out
@templatedoc()
def slice(input, axes, starts, ends):
"""
${comment}
Args:
input (Variable): ${input_comment}.
axes (List): ${axes_comment}
starts (List): ${starts_comment}
ends (List): ${ends_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('slice', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
helper.append_op(
type='slice',
inputs={'Input': input},
outputs={'Out': out},
attrs={'axes': axes,
'starts': starts,
'ends': ends})
return out
@templatedoc()
def shape(input):
"""
${comment}
Args:
input (Variable): ${input_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('shape', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
helper.append_op(
type='shape', inputs={'Input': input}, outputs={'Out': out})
return out
def _elementwise_op(helper): def _elementwise_op(helper):
op_type = helper.layer_type op_type = helper.layer_type
x = helper.kwargs.get('x', None) x = helper.kwargs.get('x', None)
......
...@@ -45,13 +45,6 @@ __all__ = [ ...@@ -45,13 +45,6 @@ __all__ = [
'logical_or', 'logical_or',
'logical_xor', 'logical_xor',
'logical_not', 'logical_not',
'uniform_random_batch_size_like',
'gaussian_random',
'sampling_id',
'gaussian_random_batch_size_like',
'sum',
'slice',
'shape',
'maxout', 'maxout',
] ]
...@@ -63,6 +56,8 @@ for _OP in set(__all__): ...@@ -63,6 +56,8 @@ for _OP in set(__all__):
# e.g.: test_program_code.py, test_dist_train.py # e.g.: test_program_code.py, test_dist_train.py
globals()['_scale'] = generate_layer_fn('scale') globals()['_scale'] = generate_layer_fn('scale')
globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
__all__ += __activations_noattr__ __all__ += __activations_noattr__
for _OP in set(__activations_noattr__): for _OP in set(__activations_noattr__):
......
...@@ -26,6 +26,7 @@ from .layer_helper import LayerHelper ...@@ -26,6 +26,7 @@ from .layer_helper import LayerHelper
from .regularizer import append_regularization_ops from .regularizer import append_regularization_ops
from .clip import append_gradient_clip_ops, error_clip_callback from .clip import append_gradient_clip_ops, error_clip_callback
from contextlib import contextmanager from contextlib import contextmanager
from .layers import ops
__all__ = [ __all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
...@@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer): ...@@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer):
x=tmp, dtype='float32' if self._dtype == None else self._dtype) x=tmp, dtype='float32' if self._dtype == None else self._dtype)
sum = layers.cast( sum = layers.cast(
x=sum, dtype='float32' if self._dtype == None else self._dtype) x=sum, dtype='float32' if self._dtype == None else self._dtype)
layers.elementwise_div(x=sum, y=tmp, out=param) ops._elementwise_div(x=sum, y=tmp, out=param)
def _add_average_restore_op(self, block, param_grad): def _add_average_restore_op(self, block, param_grad):
param = block._clone_variable(param_grad[0]) param = block._clone_variable(param_grad[0])
......
...@@ -67,6 +67,7 @@ def train(nn_type, ...@@ -67,6 +67,7 @@ def train(nn_type,
use_cuda, use_cuda,
parallel, parallel,
save_dirname=None, save_dirname=None,
save_full_dirname=None,
model_filename=None, model_filename=None,
params_filename=None, params_filename=None,
is_local=True): is_local=True):
...@@ -143,6 +144,13 @@ def train(nn_type, ...@@ -143,6 +144,13 @@ def train(nn_type,
exe, exe,
model_filename=model_filename, model_filename=model_filename,
params_filename=params_filename) params_filename=params_filename)
if save_full_dirname is not None:
fluid.io.save_inference_model(
save_full_dirname, [], [],
exe,
model_filename=model_filename,
params_filename=params_filename,
export_for_deployment=False)
return return
else: else:
print( print(
...@@ -214,10 +222,12 @@ def infer(use_cuda, ...@@ -214,10 +222,12 @@ def infer(use_cuda,
def main(use_cuda, parallel, nn_type, combine): def main(use_cuda, parallel, nn_type, combine):
save_dirname = None save_dirname = None
save_full_dirname = None
model_filename = None model_filename = None
params_filename = None params_filename = None
if not use_cuda and not parallel: if not use_cuda and not parallel:
save_dirname = "recognize_digits_" + nn_type + ".inference.model" save_dirname = "recognize_digits_" + nn_type + ".inference.model"
save_full_dirname = "recognize_digits_" + nn_type + ".train.model"
if combine == True: if combine == True:
model_filename = "__model_combined__" model_filename = "__model_combined__"
params_filename = "__params_combined__" params_filename = "__params_combined__"
...@@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine): ...@@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine):
use_cuda=use_cuda, use_cuda=use_cuda,
parallel=parallel, parallel=parallel,
save_dirname=save_dirname, save_dirname=save_dirname,
save_full_dirname=save_full_dirname,
model_filename=model_filename, model_filename=model_filename,
params_filename=params_filename) params_filename=params_filename)
infer( infer(
......
...@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl ...@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
if(APPLE) if(APPLE)
if(NOT WITH_DISTRIBUTE) if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_desc_clone)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import dist_ctr_reader
from test_dist_base import TestDistRunnerBase, runtime_main
IS_SPARSE = True
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
class TestDistCTR2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
""" network definition """
dnn_data = fluid.layers.data(
name="dnn_data",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
lr_data = fluid.layers.data(
name="lr_data",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
label = fluid.layers.data(
name="click",
shape=[-1, 1],
dtype="int64",
lod_level=0,
append_batch_size=False)
# build dnn model
dnn_layer_dims = [128, 64, 32, 1]
dnn_embedding = fluid.layers.embedding(
is_distributed=False,
input=dnn_data,
size=[dnn_input_dim, dnn_layer_dims[0]],
param_attr=fluid.ParamAttr(
name="deep_embedding",
initializer=fluid.initializer.Constant(value=0.01)),
is_sparse=IS_SPARSE)
dnn_pool = fluid.layers.sequence_pool(
input=dnn_embedding, pool_type="sum")
dnn_out = dnn_pool
for i, dim in enumerate(dnn_layer_dims[1:]):
fc = fluid.layers.fc(
input=dnn_out,
size=dim,
act="relu",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)),
name='dnn-fc-%d' % i)
dnn_out = fc
# build lr model
lr_embbding = fluid.layers.embedding(
is_distributed=False,
input=lr_data,
size=[lr_input_dim, 1],
param_attr=fluid.ParamAttr(
name="wide_embedding",
initializer=fluid.initializer.Constant(value=0.01)),
is_sparse=IS_SPARSE)
lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
acc = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
label=label)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
inference_program = paddle.fluid.default_main_program().clone()
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
sgd_optimizer.minimize(avg_cost)
dataset = dist_ctr_reader.Dataset()
train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
test_reader = paddle.batch(dataset.test(), batch_size=batch_size)
return inference_program, avg_cost, train_reader, test_reader, None, predict
if __name__ == "__main__":
runtime_main(TestDistCTR2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import paddle
import tarfile
logging.basicConfig()
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz"
DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
"""
avazu_ctr_data/train.txt
avazu_ctr_data/infer.txt
avazu_ctr_data/test.txt
avazu_ctr_data/data.meta.txt
"""
def read_data(file_name):
path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5)
tar = tarfile.open(path, "r:gz")
tar_info = None
for member in tar.getmembers():
if member.name.endswith(file_name):
tar_info = member
f = tar.extractfile(tar_info)
ret_lines = [_.decode('utf-8') for _ in f.readlines()]
return ret_lines
class TaskMode:
TRAIN_MODE = 0
TEST_MODE = 1
INFER_MODE = 2
def __init__(self, mode):
self.mode = mode
def is_train(self):
return self.mode == self.TRAIN_MODE
def is_test(self):
return self.mode == self.TEST_MODE
def is_infer(self):
return self.mode == self.INFER_MODE
@staticmethod
def create_train():
return TaskMode(TaskMode.TRAIN_MODE)
@staticmethod
def create_test():
return TaskMode(TaskMode.TEST_MODE)
@staticmethod
def create_infer():
return TaskMode(TaskMode.INFER_MODE)
class ModelType:
CLASSIFICATION = 0
REGRESSION = 1
def __init__(self, mode):
self.mode = mode
def is_classification(self):
return self.mode == self.CLASSIFICATION
def is_regression(self):
return self.mode == self.REGRESSION
@staticmethod
def create_classification():
return ModelType(ModelType.CLASSIFICATION)
@staticmethod
def create_regression():
return ModelType(ModelType.REGRESSION)
def load_dnn_input_record(sent):
return list(map(int, sent.split()))
def load_lr_input_record(sent):
res = []
for _ in [x.split(':') for x in sent.split()]:
res.append(int(_[0]))
return res
feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2}
class Dataset(object):
def train(self):
'''
Load trainset.
'''
file_name = "train.txt"
logger.info("load trainset from %s" % file_name)
mode = TaskMode.create_train()
return self._parse_creator(file_name, mode)
def test(self):
'''
Load testset.
'''
file_name = "test.txt"
logger.info("load testset from %s" % file_name)
mode = TaskMode.create_test()
return self._parse_creator(file_name, mode)
def infer(self):
'''
Load infer set.
'''
file_name = "infer.txt"
logger.info("load inferset from %s" % file_name)
mode = TaskMode.create_infer()
return self._parse_creator(file_name, mode)
def _parse_creator(self, file_name, mode):
'''
Parse dataset.
'''
def _parse():
data = read_data(file_name)
for line_id, line in enumerate(data):
fs = line.strip().split('\t')
dnn_input = load_dnn_input_record(fs[0])
lr_input = load_lr_input_record(fs[1])
if not mode.is_infer():
click = int(fs[2])
yield [dnn_input, lr_input, click]
else:
yield [dnn_input, lr_input]
return _parse
def load_data_meta():
'''
load data meta info from path, return (dnn_input_dim, lr_input_dim)
'''
lines = read_data('data.meta.txt')
err_info = "wrong meta format"
assert len(lines) == 2, err_info
assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
1], err_info
res = map(int, [_.split(':')[1] for _ in lines])
res = list(res)
logger.info('dnn input dim: %d' % res[0])
logger.info('lr input dim: %d' % res[1])
return res
...@@ -47,7 +47,7 @@ def cnn_model(data): ...@@ -47,7 +47,7 @@ def cnn_model(data):
pool_stride=2, pool_stride=2,
act="relu", act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.3))) value=0.01)))
conv_pool_2 = fluid.nets.simple_img_conv_pool( conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1, input=conv_pool_1,
filter_size=5, filter_size=5,
...@@ -56,7 +56,7 @@ def cnn_model(data): ...@@ -56,7 +56,7 @@ def cnn_model(data):
pool_stride=2, pool_stride=2,
act="relu", act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.2))) value=0.01)))
SIZE = 10 SIZE = 10
input_shape = conv_pool_2.shape input_shape = conv_pool_2.shape
...@@ -68,7 +68,7 @@ def cnn_model(data): ...@@ -68,7 +68,7 @@ def cnn_model(data):
size=SIZE, size=SIZE,
act="softmax", act="softmax",
param_attr=fluid.param_attr.ParamAttr( param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1))) initializer=fluid.initializer.Constant(value=0.01)))
return predict return predict
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
DTYPE = "int64"
DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000'
DATA_MD5 = '24e49366eb0611c552667989de2f57d5'
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
dict_dim = 1500
emb_dim = 128
hid_dim = 128
margin = 0.1
sample_rate = 1
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
def get_acc(cos_q_nt, cos_q_pt, batch_size):
cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
cond = fluid.layers.cast(cond, dtype='float64')
cond_3 = fluid.layers.reduce_sum(cond)
acc = fluid.layers.elementwise_div(
cond_3,
fluid.layers.fill_constant(
shape=[1], value=batch_size * 1.0, dtype='float64'),
name="simnet_acc")
return acc
def get_loss(cos_q_pt, cos_q_nt):
loss_op1 = fluid.layers.elementwise_sub(
fluid.layers.fill_constant_batch_size_like(
input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'),
cos_q_pt)
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max(
fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_op2)
avg_cost = fluid.layers.mean(loss_op3)
return avg_cost
def get_optimizer():
# SGD optimizer
optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
return optimizer
def train_network(batch_size, is_distributed=False, is_sparse=False):
# query
q = fluid.layers.data(
name="query_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
q_emb = fluid.layers.embedding(
input=q,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
## fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__q_fc__",
learning_rate=base_lr))
# label data
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
# pt
pt = fluid.layers.data(
name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
pt_emb = fluid.layers.embedding(
input=pt,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
## fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__fc__",
learning_rate=base_lr),
bias_attr=fluid.ParamAttr(name="__fc_b__"))
# nt
nt = fluid.layers.data(
name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
nt_emb = fluid.layers.embedding(
input=nt,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
## fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__fc__",
learning_rate=base_lr),
bias_attr=fluid.ParamAttr(name="__fc_b__"))
cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
# loss
avg_cost = get_loss(cos_q_pt, cos_q_nt)
# acc
acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
return [avg_cost, acc, cos_q_pt]
def combination(x, y):
res = [[[xi, yi] for yi in y] for xi in x]
return res[0]
def get_one_data(file_list):
for file in file_list:
contents = []
with open(file, "r") as fin:
for i in fin:
contents.append(i.strip())
for index, q in enumerate(contents):
try:
one_data = [[int(j) for j in i.split(" ")]
for i in q.split(";")[:-1]]
if one_data[1][0] + one_data[1][1] != len(one_data) - 3:
q = fin.readline()
continue
tmp = combination(one_data[3:3 + one_data[1][0]],
one_data[3 + one_data[1][0]:])
except Exception as e:
continue
for each in tmp:
yield [one_data[2], 0, each[0], each[1]]
def get_batch_reader(file_list, batch_size):
def batch_reader():
res = []
for i in get_one_data(file_list):
if random.random() <= sample_rate:
res.append(i)
if len(res) >= batch_size:
yield res
res = []
return batch_reader
def get_train_reader(batch_size):
# The training data set.
train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet",
"train")
train_reader = get_batch_reader([train_file], batch_size)
train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
return train_reader, train_feed
class TestDistSimnetBow2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
# Train program
avg_cost, acc, predict = \
train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])))
inference_program = fluid.default_main_program().clone()
# Optimization
opt = get_optimizer()
opt.minimize(avg_cost)
# Reader
train_reader, _ = get_train_reader(batch_size)
return inference_program, avg_cost, train_reader, train_reader, acc, predict
if __name__ == "__main__":
paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
runtime_main(TestDistSimnetBow2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
import six
import tarfile
import string
import re
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
DTYPE = "float32"
VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab'
VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2'
DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz'
DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a'
# Load dictionary.
def load_vocab(filename):
vocab = {}
if six.PY2:
with open(filename, 'r') as f:
for idx, line in enumerate(f):
vocab[line.strip()] = idx
else:
with open(filename, 'r', encoding="utf-8") as f:
for idx, line in enumerate(f):
vocab[line.strip()] = idx
return vocab
def get_worddict(dict_path):
word_dict = load_vocab(dict_path)
word_dict["<unk>"] = len(word_dict)
dict_dim = len(word_dict)
return word_dict, dict_dim
def conv_net(input,
dict_dim,
emb_dim=128,
window_size=3,
num_filters=128,
fc0_dim=96,
class_dim=2):
emb = fluid.layers.embedding(
input=input,
size=[dict_dim, emb_dim],
is_sparse=False,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
conv_3 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=num_filters,
filter_size=window_size,
act="tanh",
pool_type="max",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
fc_0 = fluid.layers.fc(
input=[conv_3],
size=fc0_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
prediction = fluid.layers.fc(
input=[fc_0],
size=class_dim,
act="softmax",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
return prediction
def inference_network(dict_dim):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
out = conv_net(data, dict_dim)
return out
def get_reader(word_dict, batch_size):
# The training data set.
train_reader = paddle.batch(train(word_dict), batch_size=batch_size)
# The testing data set.
test_reader = paddle.batch(test(word_dict), batch_size=batch_size)
return train_reader, test_reader
def get_optimizer(learning_rate):
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
return optimizer
class TestDistTextClassification2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
vocab = os.path.join(paddle.dataset.common.DATA_HOME,
"text_classification", "imdb.vocab")
word_dict, dict_dim = get_worddict(vocab)
# Input data
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = conv_net(data, dict_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=predict, label=label)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = get_optimizer(learning_rate=0.001)
opt.minimize(avg_cost)
# Reader
train_reader, test_reader = get_reader(word_dict, batch_size)
return inference_program, avg_cost, train_reader, test_reader, acc, predict
def tokenize(pattern):
"""
Read files that match the given pattern. Tokenize and yield each file.
"""
with tarfile.open(
paddle.dataset.common.download(DATA_URL, 'text_classification',
DATA_MD5)) as tarf:
# Note that we should use tarfile.next(), which does
# sequential access of member files, other than
# tarfile.extractfile, which does random access and might
# destroy hard disks.
tf = tarf.next()
while tf != None:
if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip(six.b(
"\n\r")).translate(
None, six.b(string.punctuation)).lower().split()
tf = tarf.next()
def reader_creator(pos_pattern, neg_pattern, word_idx):
UNK = word_idx['<unk>']
INS = []
def load(pattern, out, label):
for doc in tokenize(pattern):
out.append(([word_idx.get(w, UNK) for w in doc], label))
load(pos_pattern, INS, 0)
load(neg_pattern, INS, 1)
def reader():
for doc, label in INS:
yield doc, label
return reader
def train(word_idx):
"""
IMDB training set creator.
It returns a reader creator, each sample in the reader is an zero-based ID
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Training reader creator
:rtype: callable
"""
return reader_creator(
re.compile("train/pos/.*\.txt$"),
re.compile("train/neg/.*\.txt$"), word_idx)
def test(word_idx):
"""
IMDB test set creator.
It returns a reader creator, each sample in the reader is an zero-based ID
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Test reader creator
:rtype: callable
"""
return reader_creator(
re.compile("test/pos/.*\.txt$"),
re.compile("test/neg/.*\.txt$"), word_idx)
if __name__ == "__main__":
paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
runtime_main(TestDistTextClassification2x2)
...@@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size, ...@@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size,
if weight_sharing: if weight_sharing:
predict = layers.matmul( predict = layers.matmul(
x=dec_output, x=dec_output,
y=fluid.get_var(word_emb_param_names[0]), y=fluid.framework._get_var(word_emb_param_names[0]),
transpose_y=True) transpose_y=True)
else: else:
predict = layers.fc(input=dec_output, predict = layers.fc(input=dec_output,
...@@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase): ...@@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase):
exe.run(startup_prog) exe.run(startup_prog)
exe.run(pserver_prog) exe.run(pserver_prog)
def run_trainer(self, use_cuda, args): def run_trainer(self, args):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() TrainTaskConfig.use_gpu = args.use_cuda
TrainTaskConfig.use_gpu = use_cuda sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
args.is_dist, not args.sync_mode) args.is_dist, not args.sync_mode)
if args.is_dist: if args.is_dist:
...@@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase): ...@@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase):
TrainTaskConfig.batch_size = 20 TrainTaskConfig.batch_size = 20
trainer_prog = fluid.default_main_program() trainer_prog = fluid.default_main_program()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
startup_exe = fluid.Executor(place) startup_exe = fluid.Executor(place)
TrainTaskConfig.local = not args.is_dist TrainTaskConfig.local = not args.is_dist
......
...@@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): ...@@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
if __name__ == "__main__": if __name__ == "__main__":
import os
os.environ['CPU_NUM'] = '1'
os.environ['USE_CUDA'] = "FALSE"
runtime_main(TestDistWord2vec2x2) runtime_main(TestDistWord2vec2x2)
...@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase): ...@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t)) str(actual_t) + " in class " + self.__class__.__name__)
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
......
...@@ -36,7 +36,11 @@ class TestAucOp(OpTest): ...@@ -36,7 +36,11 @@ class TestAucOp(OpTest):
"StatPos": stat_pos, "StatPos": stat_pos,
"StatNeg": stat_neg "StatNeg": stat_neg
} }
self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} self.attrs = {
'curve': 'ROC',
'num_thresholds': num_thresholds,
"slide_steps": 1
}
python_auc = metrics.Auc(name="auc", python_auc = metrics.Auc(name="auc",
curve='ROC', curve='ROC',
...@@ -45,7 +49,6 @@ class TestAucOp(OpTest): ...@@ -45,7 +49,6 @@ class TestAucOp(OpTest):
self.outputs = { self.outputs = {
'AUC': np.array(python_auc.eval()), 'AUC': np.array(python_auc.eval()),
'BatchAUC': np.array(python_auc.eval()),
'StatPosOut': np.array(python_auc._stat_pos), 'StatPosOut': np.array(python_auc._stat_pos),
'StatNegOut': np.array(python_auc._stat_neg) 'StatNegOut': np.array(python_auc._stat_neg)
} }
......
...@@ -20,6 +20,7 @@ import six ...@@ -20,6 +20,7 @@ import six
import sys import sys
import collections import collections
import math import math
import paddle.fluid as fluid
from op_test import OpTest from op_test import OpTest
...@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest): ...@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest):
self.detect = np.array(self.detect).astype('float32') self.detect = np.array(self.detect).astype('float32')
self.mAP = np.array(self.mAP).astype('float32') self.mAP = np.array(self.mAP).astype('float32')
if (len(self.class_pos_count) > 0): if len(self.class_pos_count) > 0:
self.class_pos_count = np.array(self.class_pos_count).astype( self.class_pos_count = np.array(self.class_pos_count).astype(
'int32') 'int32')
self.true_pos = np.array(self.true_pos).astype('float32') self.true_pos = np.array(self.true_pos).astype('float32')
...@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): ...@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
def init_test_case(self): def init_test_case(self):
super(TestDetectionMAPOpMultiBatch, self).init_test_case() super(TestDetectionMAPOpMultiBatch, self).init_test_case()
self.class_pos_count = [0, 2, 1] self.class_pos_count = [0, 2, 1, 0]
self.true_pos_lod = [[0, 3, 2]] self.true_pos_lod = [[0, 3, 2]]
self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
self.false_pos_lod = [[0, 3, 2]] self.false_pos_lod = [[0, 3, 2]]
......
...@@ -18,23 +18,27 @@ import time ...@@ -18,23 +18,27 @@ import time
import unittest import unittest
import os import os
import sys import sys
import six
import signal import signal
import subprocess import subprocess
import six
import argparse import argparse
import paddle.fluid as fluid
RUN_STEP = 10
class TestDistRunnerBase(object): class TestDistRunnerBase(object):
def get_model(self, batch_size=2): def get_model(self, batch_size=2):
raise NotImplementedError( raise NotImplementedError(
"get_model should be implemented by child classes.") "get_model should be implemented by child classes.")
def get_transpiler(self, trainer_id, main_program, pserver_endpoints, @staticmethod
trainers, sync_mode): def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers,
sync_mode):
# NOTE: import fluid until runtime, or else forking processes will cause error. # NOTE: import fluid until runtime, or else forking processes will cause error.
import paddle config = fluid.DistributeTranspilerConfig()
import paddle.fluid as fluid t = fluid.DistributeTranspiler(config=config)
t = fluid.DistributeTranspiler()
t.transpile( t.transpile(
trainer_id=trainer_id, trainer_id=trainer_id,
program=main_program, program=main_program,
...@@ -44,11 +48,9 @@ class TestDistRunnerBase(object): ...@@ -44,11 +48,9 @@ class TestDistRunnerBase(object):
return t return t
def run_pserver(self, args): def run_pserver(self, args):
import paddle
import paddle.fluid as fluid
self.get_model(batch_size=2) self.get_model(batch_size=2)
if args.mem_opt: # NOTE: pserver should not call memory optimize
fluid.memory_optimize(fluid.default_main_program())
t = self.get_transpiler(args.trainer_id, t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), args.endpoints, fluid.default_main_program(), args.endpoints,
args.trainers, args.sync_mode) args.trainers, args.sync_mode)
...@@ -61,29 +63,34 @@ class TestDistRunnerBase(object): ...@@ -61,29 +63,34 @@ class TestDistRunnerBase(object):
exe.run(startup_prog) exe.run(startup_prog)
exe.run(pserver_prog) exe.run(pserver_prog)
def run_trainer(self, use_cuda, args): def run_trainer(self, args):
import paddle
import paddle.fluid as fluid
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
self.get_model(batch_size=2) self.get_model(batch_size=2)
if args.mem_opt: if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
if args.is_dist: if args.is_dist:
t = self.get_transpiler(args.trainer_id, t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), fluid.default_main_program(),
args.endpoints, args.trainers, args.endpoints, args.trainers,
args.sync_mode) args.sync_mode)
trainer_prog = t.get_trainer_program() trainer_prog = t.get_trainer_program()
else: else:
trainer_prog = fluid.default_main_program() trainer_prog = fluid.default_main_program()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
startup_exe = fluid.Executor(place) startup_exe = fluid.Executor(place)
startup_exe.run(fluid.default_startup_program()) startup_exe.run(fluid.default_startup_program())
strategy = fluid.ExecutionStrategy() strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1 strategy.num_threads = 1
strategy.allow_op_delay = False strategy.allow_op_delay = False
build_stra = fluid.BuildStrategy() build_stra = fluid.BuildStrategy()
if args.use_reduce: if args.use_reduce:
...@@ -92,7 +99,7 @@ class TestDistRunnerBase(object): ...@@ -92,7 +99,7 @@ class TestDistRunnerBase(object):
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
exe = fluid.ParallelExecutor( exe = fluid.ParallelExecutor(
use_cuda, args.use_cuda,
loss_name=avg_cost.name, loss_name=avg_cost.name,
exec_strategy=strategy, exec_strategy=strategy,
build_strategy=build_stra) build_strategy=build_stra)
...@@ -103,27 +110,26 @@ class TestDistRunnerBase(object): ...@@ -103,27 +110,26 @@ class TestDistRunnerBase(object):
] ]
feeder = fluid.DataFeeder(feed_var_list, place) feeder = fluid.DataFeeder(feed_var_list, place)
reader_generator = test_reader() reader_generator = train_reader()
data = next(reader_generator)
first_loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(data))
print(first_loss)
for i in six.moves.xrange(5): def get_data():
data = next(reader_generator) origin_batch = next(reader_generator)
loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) if args.is_dist and args.use_reader_alloc:
new_batch = []
for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id:
new_batch.append(item)
return new_batch
else:
return origin_batch
data = next(reader_generator) for _ in six.moves.xrange(RUN_STEP):
last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) loss, = exe.run(fetch_list=[avg_cost.name],
print(last_loss) feed=feeder.feed(get_data()))
print(loss)
def runtime_main(test_class): def runtime_main(test_class):
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
parser = argparse.ArgumentParser(description='Run dist test.') parser = argparse.ArgumentParser(description='Run dist test.')
parser.add_argument( parser.add_argument(
'--role', type=str, required=True, choices=['pserver', 'trainer']) '--role', type=str, required=True, choices=['pserver', 'trainer'])
...@@ -135,7 +141,10 @@ def runtime_main(test_class): ...@@ -135,7 +141,10 @@ def runtime_main(test_class):
'--current_endpoint', type=str, required=False, default="") '--current_endpoint', type=str, required=False, default="")
parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--sync_mode', action='store_true')
parser.add_argument('--mem_opt', action='store_true') parser.add_argument('--mem_opt', action='store_true')
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--use_reduce', action='store_true')
parser.add_argument(
'--use_reader_alloc', action='store_true', required=False, default=True)
args = parser.parse_args() args = parser.parse_args()
...@@ -143,8 +152,7 @@ def runtime_main(test_class): ...@@ -143,8 +152,7 @@ def runtime_main(test_class):
if args.role == "pserver" and args.is_dist: if args.role == "pserver" and args.is_dist:
model.run_pserver(args) model.run_pserver(args)
else: else:
use_cuda = True if core.is_compiled_with_cuda() else False model.run_trainer(args)
model.run_trainer(use_cuda, args)
import paddle.compat as cpt import paddle.compat as cpt
...@@ -163,8 +171,10 @@ class TestDistBase(unittest.TestCase): ...@@ -163,8 +171,10 @@ class TestDistBase(unittest.TestCase):
self._find_free_port(), self._find_free_port()) self._find_free_port(), self._find_free_port())
self._python_interp = "python" self._python_interp = "python"
self._sync_mode = True self._sync_mode = True
self._use_cuda = True
self._mem_opt = False self._mem_opt = False
self._use_reduce = False self._use_reduce = False
self._use_reader_alloc = True
self._setup_config() self._setup_config()
def _find_free_port(self): def _find_free_port(self):
...@@ -172,15 +182,15 @@ class TestDistBase(unittest.TestCase): ...@@ -172,15 +182,15 @@ class TestDistBase(unittest.TestCase):
s.bind(('', 0)) s.bind(('', 0))
return s.getsockname()[1] return s.getsockname()[1]
def start_pserver(self, model_file, check_error_log): def start_pserver(self, model_file, check_error_log, required_envs):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
ps0_cmd = ps_cmd % \ ps0_cmd = ps_cmd % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep, (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
self._trainers) self._trainers)
ps1_cmd = ps_cmd % \ ps1_cmd = ps_cmd % \
(self._python_interp, model_file, self._ps_endpoints, ps1_ep, (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers) self._trainers)
if self._sync_mode: if self._sync_mode:
ps0_cmd += " --sync_mode" ps0_cmd += " --sync_mode"
...@@ -198,9 +208,15 @@ class TestDistBase(unittest.TestCase): ...@@ -198,9 +208,15 @@ class TestDistBase(unittest.TestCase):
ps1_pipe = open("/tmp/ps1_err.log", "wb") ps1_pipe = open("/tmp/ps1_err.log", "wb")
ps0_proc = subprocess.Popen( ps0_proc = subprocess.Popen(
ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) ps0_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=ps0_pipe,
env=required_envs)
ps1_proc = subprocess.Popen( ps1_proc = subprocess.Popen(
ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) ps1_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=ps1_pipe,
env=required_envs)
if not check_error_log: if not check_error_log:
return ps0_proc, ps1_proc, None, None return ps0_proc, ps1_proc, None, None
...@@ -222,59 +238,60 @@ class TestDistBase(unittest.TestCase): ...@@ -222,59 +238,60 @@ class TestDistBase(unittest.TestCase):
(e, retry_times)) (e, retry_times))
retry_times -= 1 retry_times -= 1
def check_with_place(self, model_file, delta=1e-3, check_error_log=False): def _run_local(self, model, envs, check_error_log):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1",
"CPU_NUM": "1"
}
if check_error_log: cmd = "%s %s --role trainer" % (self._python_interp, model)
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1" if self._use_cuda:
cmd += " --use_cuda"
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
else:
env_local = {'CPU_NUM': '1'}
envs.update(env_local)
# Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs)
local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
if not check_error_log: if not check_error_log:
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen( local_proc = subprocess.Popen(
local_cmd.split(" "), cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=err_log,
env=env_local) env=envs)
else: else:
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen( local_proc = subprocess.Popen(
local_cmd.split(" "), cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=err_log, stderr=subprocess.PIPE,
env=env_local) env=envs)
local_proc.wait() local_proc.wait()
out, err = local_proc.communicate() local_out, local_err = local_proc.communicate()
local_ret = cpt.to_text(out) local_ret = cpt.to_text(local_out)
sys.stderr.write('local_loss: %s\n' % local_ret)
sys.stderr.write('local_stderr: %s\n' % err) if check_error_log:
err_log.close()
sys.stderr.write('local_stdout: %s\n' % local_ret)
sys.stderr.write('local_stderr: %s\n' % local_err)
local_losses = local_ret.split("\n")
return local_losses
def _run_cluster(self, model, envs, check_error_log):
# Run dist train to compare with local results # Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file, ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
check_error_log) check_error_log, envs)
self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid) self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
tr0_cmd = tr_cmd % \ tr0_cmd = tr_cmd % \
(self._python_interp, model_file, self._ps_endpoints, (self._python_interp, model, self._ps_endpoints,
0, ps0_ep, self._trainers) 0, ps0_ep, self._trainers)
tr1_cmd = tr_cmd % \ tr1_cmd = tr_cmd % \
(self._python_interp, model_file, self._ps_endpoints, (self._python_interp, model, self._ps_endpoints,
1, ps1_ep, self._trainers) 1, ps1_ep, self._trainers)
if self._sync_mode: if self._sync_mode:
tr0_cmd += " --sync_mode" tr0_cmd += " --sync_mode"
...@@ -285,18 +302,28 @@ class TestDistBase(unittest.TestCase): ...@@ -285,18 +302,28 @@ class TestDistBase(unittest.TestCase):
if self._use_reduce: if self._use_reduce:
tr0_cmd += " --use_reduce" tr0_cmd += " --use_reduce"
tr1_cmd += " --use_reduce" tr1_cmd += " --use_reduce"
if self._use_reader_alloc:
tr0_cmd += " --use_reader_alloc"
tr1_cmd += " --use_reader_alloc"
if self._use_cuda:
tr0_cmd += " --use_cuda"
tr1_cmd += " --use_cuda"
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
else:
env0 = {'CPU_NUM': '1'}
env1 = {'CPU_NUM': '1'}
env0.update(envs)
env1.update(envs)
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
env0.update(required_envs)
env1.update(required_envs)
FNULL = open(os.devnull, 'w') FNULL = open(os.devnull, 'w')
tr0_pipe = subprocess.PIPE tr0_pipe = subprocess.PIPE
tr1_pipe = subprocess.PIPE tr1_pipe = subprocess.PIPE
if check_error_log: if check_error_log:
print("tr0_cmd:", tr0_cmd) print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
print("tr1_cmd:", tr1_cmd) print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb") tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb")
...@@ -313,17 +340,11 @@ class TestDistBase(unittest.TestCase): ...@@ -313,17 +340,11 @@ class TestDistBase(unittest.TestCase):
tr0_proc.wait() tr0_proc.wait()
tr1_proc.wait() tr1_proc.wait()
out, err = tr0_proc.communicate()
sys.stderr.write('dist_stderr: %s\n' % err) tr0_out, tr0_err = tr0_proc.communicate()
loss_data0 = cpt.to_text(out) tr0_loss_text = cpt.to_text(tr0_out)
sys.stderr.write('dist_loss: %s\n' % loss_data0) tr1_out, tr1_err = tr1_proc.communicate()
lines = loss_data0.split("\n") tr1_loss_text = cpt.to_text(tr1_out)
dist_first_loss = eval(lines[0].replace(" ", ","))[0]
dist_last_loss = eval(lines[1].replace(" ", ","))[0]
local_lines = local_ret.split("\n")
local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0]
# close trainer file # close trainer file
if check_error_log: if check_error_log:
...@@ -341,5 +362,47 @@ class TestDistBase(unittest.TestCase): ...@@ -341,5 +362,47 @@ class TestDistBase(unittest.TestCase):
ps1.wait() ps1.wait()
FNULL.close() FNULL.close()
self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) # print log
self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
tr0_losses = tr0_loss_text.split("\n")
tr1_losses = tr1_loss_text.split("\n")
return tr0_losses, tr1_losses
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1",
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
local_losses\
= self._run_local(model_file, required_envs,
check_error_log)
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs,
check_error_log)
for step_id in range(RUN_STEP):
local_loss = eval(local_losses[step_id])[0]
tr0_loss = eval(tr0_losses[step_id])[0]
tr1_loss = eval(tr1_losses[step_id])[0]
dist_loss = (tr0_loss + tr1_loss) / 2
print(str(local_loss) + ":" + str(dist_loss))
self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistCTR2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_dist_ctr(self):
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
if __name__ == "__main__":
unittest.main()
...@@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase): ...@@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase):
self._use_reduce = False self._use_reduce = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnist2x2WithMemopt(TestDistBase): class TestDistMnist2x2WithMemopt(TestDistBase):
...@@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase): ...@@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
self._mem_opt = True self._mem_opt = True
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnistAsync(TestDistBase): class TestDistMnistAsync(TestDistBase):
......
...@@ -20,24 +20,25 @@ from test_dist_base import TestDistBase ...@@ -20,24 +20,25 @@ from test_dist_base import TestDistBase
class TestDistSeResneXt2x2(TestDistBase): class TestDistSeResneXt2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._use_reader_alloc = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7) self.check_with_place("dist_se_resnext.py", delta=100)
# TODO(typhoonzero): fix this test class TestDistseResnXt2x2WithMemopt(TestDistBase):
# class TestDistseResnXt2x2WithMemopt(TestDistBase): def _setup_config(self):
# def _setup_config(self): self._sync_mode = True
# self._sync_mode = True self._mem_opt = True
# self._mem_opt = True
# def test_dist_train(self): def test_dist_train(self):
# self.check_with_place("dist_se_resnext.py", delta=1e-7) self.check_with_place("dist_se_resnext.py", delta=100)
class TestDistSeResneXt2x2Async(TestDistBase): class TestDistSeResneXt2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
self._use_reader_alloc = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=100)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistSimnetBowDense2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBow2x2DenseAsync(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBowSparse2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBow2x2SparseAsync(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistTextClassification2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_text_classification(self):
self.check_with_place("dist_text_classification.py", delta=1e-6)
class TestDistTextClassification2x2Async(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_se_resnext(self):
self.check_with_place("dist_text_classification.py", delta=100)
if __name__ == "__main__":
unittest.main()
...@@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest): ...@@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest):
]) ])
class TestDecayedAdagrad(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1)
opt.minimize(avg_cost)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer, _ = self.get_trainer()
class TestLRDecayConditional(TranspilerTest): class TestLRDecayConditional(TranspilerTest):
def net_conf(self): def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32') x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
......
...@@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase): ...@@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase):
self._sync_mode = False self._sync_mode = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1) self.check_with_place("dist_word2vec.py", delta=100)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest): ...@@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest):
'eta': self.eta 'eta': self.eta
} }
print("lod = ", self.lod)
self.outputs = { self.outputs = {
'RpnRois': (self.rpn_rois[0], [self.lod]), 'RpnRois': (self.rpn_rois[0], [self.lod]),
'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]) 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod])
...@@ -295,7 +294,7 @@ class TestGenerateProposalsOp(OpTest): ...@@ -295,7 +294,7 @@ class TestGenerateProposalsOp(OpTest):
self.post_nms_topN = 5000 # train 6000, test 1000 self.post_nms_topN = 5000 # train 6000, test 1000
self.nms_thresh = 0.7 self.nms_thresh = 0.7
self.min_size = 3.0 self.min_size = 3.0
self.eta = 0.8 self.eta = 1.
def init_test_input(self): def init_test_input(self):
batch_size = 1 batch_size = 1
......
...@@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase): ...@@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase):
mul_op_desc.set_input("X", ["x"]) mul_op_desc.set_input("X", ["x"])
mul_op_desc.set_input("Y", ["y"]) mul_op_desc.set_input("Y", ["y"])
mul_op_desc.set_output("Out", ["out"]) mul_op_desc.set_output("Out", ["out"])
mul_op_desc.set_attr("x_num_col_dims", 1) mul_op_desc._set_attr("x_num_col_dims", 1)
mul_op_desc.set_attr("y_num_col_dims", 1) mul_op_desc._set_attr("y_num_col_dims", 1)
mul_op_desc.check_attrs() mul_op_desc.check_attrs()
mul_op_desc.infer_shape(block) mul_op_desc.infer_shape(block)
......
...@@ -541,7 +541,7 @@ class TestBook(unittest.TestCase): ...@@ -541,7 +541,7 @@ class TestBook(unittest.TestCase):
with program_guard(program): with program_guard(program):
input = layers.data( input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32") name="input", shape=[3, 100, 100], dtype="float32")
out = layers.shape(input, name="shape") out = layers.shape(input)
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
...@@ -758,6 +758,65 @@ class TestBook(unittest.TestCase): ...@@ -758,6 +758,65 @@ class TestBook(unittest.TestCase):
out = layers.expand(x, [1, 2]) out = layers.expand(x, [1, 2])
print(str(program)) print(str(program))
def test_uniform_random_batch_size_like(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.uniform_random_batch_size_like(input, [-1, 11])
self.assertIsNotNone(out)
print(str(program))
def test_gaussian_random(self):
program = Program()
with program_guard(program):
out = layers.gaussian_random(shape=[20, 30])
self.assertIsNotNone(out)
print(str(program))
def test_sampling_id(self):
program = Program()
with program_guard(program):
x = layers.data(
name="X",
shape=[13, 11],
dtype='float32',
append_batch_size=False)
out = layers.sampling_id(x)
self.assertIsNotNone(out)
print(str(program))
def test_gaussian_random_batch_size_like(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.gaussian_random_batch_size_like(
input, shape=[-1, 11], mean=1.0, std=2.0)
self.assertIsNotNone(out)
print(str(program))
def test_sum(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.sum(input)
self.assertIsNotNone(out)
print(str(program))
def test_slice(self):
starts = [1, 0, 2]
ends = [3, 3, 4]
axes = [0, 1, 2]
program = Program()
with program_guard(program):
input = layers.data(
name="input", shape=[3, 4, 5, 6], dtype='float32')
out = layers.slice(input, axes=axes, starts=starts, ends=ends)
def test_softshrink(self): def test_softshrink(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import unittest
import os
import sys
import math
def simple_fc_net():
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestPassBuilder(unittest.TestCase):
def check_network_convergence(self, use_cuda, build_strategy=None):
os.environ['CPU_NUM'] = str(4)
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net()
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.001)
opt.minimize(loss)
batch_size = 32
image = np.random.normal(size=(batch_size, 784)).astype('float32')
label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=loss.name,
main_program=main,
build_strategy=build_strategy)
test_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=test_program,
share_vars_from=train_exe,
build_strategy=build_strategy)
for i in range(5):
test_loss, = test_exe.run([loss.name], feed=feed_dict)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
avg_test_loss_val = np.array(test_loss).mean()
if math.isnan(float(avg_test_loss_val)):
sys.exit("got NaN loss, testing failed.")
avg_train_loss_val = np.array(train_loss).mean()
if math.isnan(float(avg_train_loss_val)):
sys.exit("got NaN loss, training failed.")
self.assertTrue(
np.allclose(
train_loss, test_loss, atol=1e-8),
"Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss))
def test_parallel_testing_with_new_strategy(self):
build_strategy = fluid.BuildStrategy()
pass_builder = build_strategy._create_passes_from_strategy()
origin_len = len(pass_builder.all_passes())
viz_pass = pass_builder.append_pass("graph_viz_pass")
self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
pass_builder.insert_pass(
len(pass_builder.all_passes()), "graph_viz_pass")
self.assertEqual(origin_len + 2, len(pass_builder.all_passes()))
pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
self.check_network_convergence(
use_cuda=core.is_compiled_with_cuda(),
build_strategy=build_strategy)
try:
os.stat("/tmp/test_viz_pass")
except os.error:
self.assertFalse(True)
if __name__ == '__main__':
unittest.main()
...@@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase): ...@@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase):
self.assertEqual(['z'], op.output("Out")) self.assertEqual(['z'], op.output("Out"))
self.assertEqual(["Out"], op.output_names()) self.assertEqual(["Out"], op.output_names())
op.set_attr("int_attr", 1) op._set_attr("int_attr", 1)
self.assertEqual(1, op.attr("int_attr")) self.assertEqual(1, op.attr("int_attr"))
self.assertTrue(op.has_attr("int_attr")) self.assertTrue(op.has_attr("int_attr"))
self.assertEqual(core.AttrType.INT, op.attr_type("int_attr")) self.assertEqual(core.AttrType.INT, op.attr_type("int_attr"))
op.set_attr("float_attr", -1.32) op._set_attr("float_attr", -1.32)
self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4) self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4)
self.assertTrue(op.has_attr("float_attr")) self.assertTrue(op.has_attr("float_attr"))
op.set_attr("bool_attr", False) op._set_attr("bool_attr", False)
self.assertFalse(op.attr("bool_attr")) self.assertFalse(op.attr("bool_attr"))
op.set_attr("string_attr", "abc") op._set_attr("string_attr", "abc")
self.assertEqual("abc", op.attr("string_attr")) self.assertEqual("abc", op.attr("string_attr"))
self.assertTrue(op.has_attr("string_attr")) self.assertTrue(op.has_attr("string_attr"))
op.set_attr("ints_attr", [1, 2, 3]) op._set_attr("ints_attr", [1, 2, 3])
self.assertEqual([1, 2, 3], op.attr("ints_attr")) self.assertEqual([1, 2, 3], op.attr("ints_attr"))
expected = [1.2, 2.3, 3.4] expected = [1.2, 2.3, 3.4]
op.set_attr("floats_attr", expected) op._set_attr("floats_attr", expected)
for e, a in zip(expected, op.attr("floats_attr")): for e, a in zip(expected, op.attr("floats_attr")):
self.assertAlmostEqual(e, a, delta=1e-4) self.assertAlmostEqual(e, a, delta=1e-4)
op.set_attr("strings_attr", ["a", "b", "c"]) op._set_attr("strings_attr", ["a", "b", "c"])
self.assertEqual(["a", "b", "c"], op.attr("strings_attr")) self.assertEqual(["a", "b", "c"], op.attr("strings_attr"))
op.set_attr("bools_attr", [True, False, True]) op._set_attr("bools_attr", [True, False, True])
self.assertEqual([True, False, True], op.attr("bools_attr")) self.assertEqual([True, False, True], op.attr("bools_attr"))
self.assertEqual(8, len(op.attr_names())) self.assertEqual(8, len(op.attr_names()))
op.set_block_attr("block_attr", program_desc.block(0)) op.set_block_attr("_block_attr", program_desc.block(0))
self.assertEqual(0, op.block_attr_id("block_attr")) self.assertEqual(0, op._block_attr_id("_block_attr"))
mul_op = block.append_op() mul_op = block.append_op()
mul_op.set_type("mul") mul_op.set_type("mul")
......
...@@ -246,6 +246,7 @@ def prepare_encoder(src_word, ...@@ -246,6 +246,7 @@ def prepare_encoder(src_word,
padding_idx=pos_pad_idx, padding_idx=pos_pad_idx,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name=pos_enc_param_name, trainable=False)) name=pos_enc_param_name, trainable=False))
src_pos_enc.stop_gradient = True
enc_input = src_word_emb + src_pos_enc enc_input = src_word_emb + src_pos_enc
# FIXME(guosheng): Decouple the program desc with batch_size. # FIXME(guosheng): Decouple the program desc with batch_size.
......
...@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory ...@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory
from .ps_dispatcher import HashName, RoundRobin from .ps_dispatcher import HashName, RoundRobin
__all__ = [ __all__ = [
"DistributeTranspiler", "memory_optimize", "release_memory", "HashName", "DistributeTranspiler",
"RoundRobin", "DistributeTranspilerConfig" "memory_optimize",
"release_memory",
"HashName",
"RoundRobin",
"DistributeTranspilerConfig",
] ]
...@@ -128,7 +128,7 @@ def op_to_code(op): ...@@ -128,7 +128,7 @@ def op_to_code(op):
attr_type = op.desc.attr_type(name) attr_type = op.desc.attr_type(name)
if attr_type == core.AttrType.BLOCK: if attr_type == core.AttrType.BLOCK:
a = "{name} = block[{value}]".format( a = "{name} = block[{value}]".format(
name=name, type=attr_type, value=op.block_attr_id(name)) name=name, type=attr_type, value=op._block_attr_id(name))
attrs_str += a attrs_str += a
if i != len(attr_names) - 1: if i != len(attr_names) - 1:
attrs_str += ", " attrs_str += ", "
...@@ -136,7 +136,7 @@ def op_to_code(op): ...@@ -136,7 +136,7 @@ def op_to_code(op):
if attr_type == core.AttrType.BLOCKS: if attr_type == core.AttrType.BLOCKS:
a = "{name} = blocks{value}".format( a = "{name} = blocks{value}".format(
name=name, type=attr_type, value=op.blocks_attr_ids(name)) name=name, type=attr_type, value=op._blocks_attr_ids(name))
attrs_str += a attrs_str += a
if i != len(attr_names) - 1: if i != len(attr_names) - 1:
attrs_str += ", " attrs_str += ", "
......
...@@ -39,8 +39,8 @@ import six ...@@ -39,8 +39,8 @@ import six
from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework from .. import core, framework
from ..framework import Program, default_main_program, \ from ..framework import Program, default_main_program, \
default_startup_program, Block, \ default_startup_program, Block, \
Parameter, grad_var_name Parameter, grad_var_name
from .details import * from .details import *
from functools import reduce from functools import reduce
...@@ -178,7 +178,7 @@ class DistributeTranspiler(object): ...@@ -178,7 +178,7 @@ class DistributeTranspiler(object):
pserver_program) pserver_program)
elif role == "TRAINER": elif role == "TRAINER":
trainer_program = t.get_trainer_program() trainer_program = t.get_trainer_program()
# for nccl2 mode # for nccl2 mode
config = fluid.DistributeTranspilerConfig() config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2" config.mode = "nccl2"
...@@ -470,7 +470,10 @@ class DistributeTranspiler(object): ...@@ -470,7 +470,10 @@ class DistributeTranspiler(object):
""" """
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
# FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
lr_ops = self._get_lr_ops()
delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), self.optimize_ops)
delete_ops(self.origin_program.global_block(), lr_ops)
self.origin_program.__str__() self.origin_program.__str__()
if wait_port: if wait_port:
...@@ -534,7 +537,7 @@ class DistributeTranspiler(object): ...@@ -534,7 +537,7 @@ class DistributeTranspiler(object):
}) })
for varname, splited_var in six.iteritems(self.param_var_mapping): for varname, splited_var in six.iteritems(self.param_var_mapping):
#add concat ops to merge splited parameters received from parameter servers. # add concat ops to merge splited parameters received from parameter servers.
if len(splited_var) <= 1: if len(splited_var) <= 1:
continue continue
# NOTE: if enable memory optimization, origin vars maybe removed. # NOTE: if enable memory optimization, origin vars maybe removed.
...@@ -668,7 +671,7 @@ in a single call.") ...@@ -668,7 +671,7 @@ in a single call.")
__clone_lr_op_sub_block__(cloned_op, program, new_sub_block) __clone_lr_op_sub_block__(cloned_op, program, new_sub_block)
# reset the block of op # reset the block of op
op.set_attr('sub_block', new_sub_block) op._set_attr('sub_block', new_sub_block)
# append lr decay ops to the child block if exists # append lr decay ops to the child block if exists
lr_ops = self._get_lr_ops() lr_ops = self._get_lr_ops()
...@@ -734,19 +737,14 @@ in a single call.") ...@@ -734,19 +737,14 @@ in a single call.")
table_opt_block = self._create_table_optimize_block( table_opt_block = self._create_table_optimize_block(
pserver_index, pserver_program, pre_block_idx, grad_to_block_id) pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
optimize_blocks.append(table_opt_block) optimize_blocks.append(table_opt_block)
prefetch_var_name_to_block_id = self._create_prefetch_block( lookup_table_var_name_to_block_id = self._create_prefetch_block(
pserver_index, pserver_program, table_opt_block) pserver_index, pserver_program, table_opt_block)
checkpoint_block_id = self._create_checkpoint_save_block( checkpoint_block_id = self._create_checkpoint_save_block(
pserver_program, table_opt_block.idx) pserver_program, table_opt_block.idx)
pserver_program._distributed_lookup_table = self.table_name pserver_program._distributed_lookup_table = self.table_name
prefetch_var_name_to_block_id.extend(
# NOTE: if has_distributed_lookup_table is False, then prefetch_block will lookup_table_var_name_to_block_id)
# not be executed, so it's safe to use optimize_block to hold the place
if self.has_distributed_lookup_table:
assert len(prefetch_var_name_to_block_id) > 0
else:
assert len(prefetch_var_name_to_block_id) == 0
attrs = { attrs = {
"optimize_blocks": optimize_blocks, "optimize_blocks": optimize_blocks,
...@@ -755,11 +753,14 @@ in a single call.") ...@@ -755,11 +753,14 @@ in a single call.")
"sync_mode": self.sync_mode, "sync_mode": self.sync_mode,
"grad_to_block_id": grad_to_block_id, "grad_to_block_id": grad_to_block_id,
} }
if len(prefetch_var_name_to_block_id) > 0:
attrs['prefetch_var_name_to_block_id'] \ if self.has_distributed_lookup_table:
= prefetch_var_name_to_block_id
attrs['checkpint_block_id'] = checkpoint_block_id attrs['checkpint_block_id'] = checkpoint_block_id
if len(prefetch_var_name_to_block_id) > 0:
attrs[
'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id
# step5 append the listen_and_serv op # step5 append the listen_and_serv op
pserver_program.global_block().append_op( pserver_program.global_block().append_op(
type="listen_and_serv", type="listen_and_serv",
...@@ -864,7 +865,7 @@ to transpile() call.") ...@@ -864,7 +865,7 @@ to transpile() call.")
if op.type in [ if op.type in [
"gaussian_random", "fill_constant", "uniform_random" "gaussian_random", "fill_constant", "uniform_random"
]: ]:
op.set_attr("shape", list(new_outputs["Out"].shape)) op._set_attr("shape", list(new_outputs["Out"].shape))
s_prog.global_block().append_op( s_prog.global_block().append_op(
type=op.type, type=op.type,
inputs=new_inputs, inputs=new_inputs,
...@@ -1013,7 +1014,7 @@ to transpile() call.") ...@@ -1013,7 +1014,7 @@ to transpile() call.")
for g, p in zip(grad_blocks, param_blocks): for g, p in zip(grad_blocks, param_blocks):
g_name, g_bid, _ = g.split(":") g_name, g_bid, _ = g.split(":")
p_name, p_bid, _ = p.split(":") p_name, p_bid, _ = p.split(":")
self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \
self.param_var_mapping[p_name][int(p_bid)] self.param_var_mapping[p_name][int(p_bid)]
# create mapping of endpoint -> split var to create pserver side program # create mapping of endpoint -> split var to create pserver side program
...@@ -1320,7 +1321,7 @@ to transpile() call.") ...@@ -1320,7 +1321,7 @@ to transpile() call.")
if len(splited) == 1: if len(splited) == 1:
if self.sync_mode and add_trainer_suffix: if self.sync_mode and add_trainer_suffix:
new_var_name = "%s.trainer_%d" % \ new_var_name = "%s.trainer_%d" % \
(orig_var.name, self.trainer_id) (orig_var.name, self.trainer_id)
program.global_block()._rename_var(varname, new_var_name) program.global_block()._rename_var(varname, new_var_name)
var_mapping[varname] = \ var_mapping[varname] = \
[program.global_block().var(new_var_name)] [program.global_block().var(new_var_name)]
...@@ -1343,10 +1344,10 @@ to transpile() call.") ...@@ -1343,10 +1344,10 @@ to transpile() call.")
new_var_name = "" new_var_name = ""
if self.sync_mode and add_trainer_suffix: if self.sync_mode and add_trainer_suffix:
new_var_name = "%s.block%d.trainer_%d" % \ new_var_name = "%s.block%d.trainer_%d" % \
(varname, i, self.trainer_id) (varname, i, self.trainer_id)
else: else:
new_var_name = "%s.block%d" % \ new_var_name = "%s.block%d" % \
(varname, i) (varname, i)
var = program.global_block().create_var( var = program.global_block().create_var(
name=new_var_name, name=new_var_name,
persistable=False, persistable=False,
...@@ -1430,6 +1431,9 @@ to transpile() call.") ...@@ -1430,6 +1431,9 @@ to transpile() call.")
elif op_type == "rmsprop": elif op_type == "rmsprop":
if varkey in ["Moment", "MeanSquare"]: if varkey in ["Moment", "MeanSquare"]:
return param_shape return param_shape
elif op_type == "decayed_adagrad":
if varkey == "Moment":
return param_shape
elif op_type == "sgd": elif op_type == "sgd":
pass pass
return orig_shape return orig_shape
...@@ -1484,9 +1488,8 @@ to transpile() call.") ...@@ -1484,9 +1488,8 @@ to transpile() call.")
vars2merge = [] vars2merge = []
for i in range(self.trainer_num): for i in range(self.trainer_num):
per_trainer_name = "%s.trainer_%d" % \ per_trainer_name = "%s.trainer_%d" % \
(merged_var_name, i) (merged_var_name, i)
vars2merge.append(pserver_block.vars[per_trainer_name]) vars2merge.append(pserver_block.vars[per_trainer_name])
optimize_block.append_op( optimize_block.append_op(
type="sum", type="sum",
inputs={"X": vars2merge}, inputs={"X": vars2merge},
...@@ -1645,7 +1648,7 @@ to transpile() call.") ...@@ -1645,7 +1648,7 @@ to transpile() call.")
# one op's output is another op's input, we say # one op's output is another op's input, we say
# the two operator is connected. # the two operator is connected.
if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \ if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
return True return True
return False return False
...@@ -1662,7 +1665,7 @@ to transpile() call.") ...@@ -1662,7 +1665,7 @@ to transpile() call.")
def _is_optimizer_op(self, op): def _is_optimizer_op(self, op):
if "Param" in op.input_names and \ if "Param" in op.input_names and \
"LearningRate" in op.input_names: "LearningRate" in op.input_names:
return True return True
return False return False
...@@ -1737,7 +1740,7 @@ to transpile() call.") ...@@ -1737,7 +1740,7 @@ to transpile() call.")
# NOTE: we need to skip all optimize ops, since it is connected # NOTE: we need to skip all optimize ops, since it is connected
# with forward/backward ops and lr ops, we only need the lr ops. # with forward/backward ops and lr ops, we only need the lr ops.
if op1 != op2 and self._is_op_connected(op1, op2) and \ if op1 != op2 and self._is_op_connected(op1, op2) and \
not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2): not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
ufind.union(op1, op2) ufind.union(op1, op2)
# find all ops which is related with lr var # find all ops which is related with lr var
for op1 in block.ops: for op1 in block.ops:
......
...@@ -163,7 +163,7 @@ class InferenceTranspiler(object): ...@@ -163,7 +163,7 @@ class InferenceTranspiler(object):
next_op = self.block.ops[i + 1] next_op = self.block.ops[i + 1]
if next_op.type == 'relu': if next_op.type == 'relu':
# modify bnorm OP to include relu # modify bnorm OP to include relu
current_op.set_attr("fuse_with_relu", True) current_op._set_attr("fuse_with_relu", True)
# remove relu OP # remove relu OP
self.block._remove_op(i + 1) self.block._remove_op(i + 1)
i = i + 1 i = i + 1
...@@ -377,7 +377,7 @@ class InferenceTranspiler(object): ...@@ -377,7 +377,7 @@ class InferenceTranspiler(object):
type=old_var.type, type=old_var.type,
dtype=old_var.dtype, dtype=old_var.dtype,
shape=old_var.shape) shape=old_var.shape)
op.rename_input(old_param_name, new_param_name) op._rename_input(old_param_name, new_param_name)
self.scope.var(new_param_name) self.scope.var(new_param_name)
tensor = self.scope.find_var(new_param_name).get_tensor() tensor = self.scope.find_var(new_param_name).get_tensor()
...@@ -463,8 +463,8 @@ class InferenceTranspiler(object): ...@@ -463,8 +463,8 @@ class InferenceTranspiler(object):
current_op = self.block.ops[i] current_op = self.block.ops[i]
for input_arg in current_op.input_arg_names: for input_arg in current_op.input_arg_names:
if input_arg in self.input_map: if input_arg in self.input_map:
current_op.rename_input(input_arg, current_op._rename_input(input_arg,
self.input_map[input_arg]) self.input_map[input_arg])
def _remove_unused_var(self): def _remove_unused_var(self):
''' '''
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
from __future__ import print_function from __future__ import print_function
from collections import defaultdict, OrderedDict, Callable from collections import defaultdict, MutableSet
from .. import core from .. import core
from ... import compat as cpt from ... import compat as cpt
from ..framework import Program, default_main_program, Parameter, Variable from ..framework import Program, default_main_program, Parameter, Variable, core
from ..backward import _rename_arg_ from ..backward import _rename_arg_
from functools import reduce from functools import reduce
from six.moves import range from six.moves import range
...@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), ...@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
PRINT_LOG = False PRINT_LOG = False
class OrderedSet(MutableSet):
def __init__(self, iterable=None):
self.end = end = []
end += [None, end, end] # sentinel node for doubly linked list
self.map = {} # key --> [key, prev, next]
if iterable is not None:
self |= iterable
def __len__(self):
return len(self.map)
def __contains__(self, key):
return key in self.map
def add(self, key):
if key not in self.map:
end = self.end
curr = end[1]
curr[2] = end[1] = self.map[key] = [key, curr, end]
def update(self, other):
for e in other:
self.add(e)
def discard(self, key):
if key in self.map:
key, prev, next = self.map.pop(key)
prev[2] = next
next[1] = prev
def remove(self, key):
self.discard(key)
def __iter__(self):
end = self.end
curr = end[2]
while curr is not end:
yield curr[0]
curr = curr[2]
def __reversed__(self):
end = self.end
curr = end[1]
while curr is not end:
yield curr[0]
curr = curr[1]
def pop(self, last=True):
if not self:
raise KeyError('set is empty')
key = self.end[1][0] if last else self.end[2][0]
self.discard(key)
return key
def __repr__(self):
if not self:
return '%s()' % (self.__class__.__name__, )
return '%s(%r)' % (self.__class__.__name__, list(self))
def __eq__(self, other):
if isinstance(other, OrderedSet):
return len(self) == len(other) and list(self) == list(other)
return set(self) == set(other)
class ControlFlowGraph(object): class ControlFlowGraph(object):
def __init__(self, program, ops, forward_num, skip_opt): def __init__(self, program, ops, forward_num, skip_opt):
self._program = program self._program = program
self._ops = ops self._ops = ops
self._forward_num = forward_num self._forward_num = forward_num
self._successors = defaultdict(set) self._successors = defaultdict(OrderedSet)
self._presuccessors = defaultdict(set) self._presuccessors = defaultdict(OrderedSet)
self._uses = defaultdict(set) self._uses = defaultdict(OrderedSet)
self._defs = defaultdict(set) self._defs = defaultdict(OrderedSet)
self._live_in = defaultdict(set) self._live_in = defaultdict(OrderedSet)
self._live_out = defaultdict(set) self._live_out = defaultdict(OrderedSet)
self._skip_opt = skip_opt self._skip_opt = skip_opt
self.pool = [] self.pool = []
...@@ -116,7 +181,7 @@ class ControlFlowGraph(object): ...@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
# NOTE: must sort the in_diff set for cases that get different cache var. # NOTE: must sort the in_diff set for cases that get different cache var.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this. # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
can_optimize = [ can_optimize = [
x for x in sorted(list(in_diff)) x for x in in_diff
if self._check_var_validity(block_desc, x, is_forward) if self._check_var_validity(block_desc, x, is_forward)
] ]
if can_optimize: if can_optimize:
...@@ -224,7 +289,7 @@ class ControlFlowGraph(object): ...@@ -224,7 +289,7 @@ class ControlFlowGraph(object):
if self.pool: if self.pool:
# NOTE: must sort the in_diff set for cases that get different cache var. # NOTE: must sort the in_diff set for cases that get different cache var.
defs_can_optimize = [ defs_can_optimize = [
x for x in sorted(list(self._defs[i])) x for x in self._defs[i]
if self._check_var_validity(block_desc, x, is_forward) if self._check_var_validity(block_desc, x, is_forward)
] ]
out_pair = [ out_pair = [
...@@ -381,7 +446,19 @@ def _get_cfgs(input_program): ...@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
return cfgs return cfgs
def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): def _is_opt_role_op(op):
op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attr_names and \
int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True
def memory_optimize(input_program,
skip_opt_set=None,
print_log=False,
level=0,
skip_grads=False):
"""Optimize memory by reusing var memory. """Optimize memory by reusing var memory.
Note: it doesn't not support subblock nested in subblock. Note: it doesn't not support subblock nested in subblock.
...@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): ...@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
raise ValueError("only support opt_level 0 or 1.") raise ValueError("only support opt_level 0 or 1.")
global PRINT_LOG global PRINT_LOG
PRINT_LOG = print_log PRINT_LOG = print_log
if skip_grads:
grad_set = set()
OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
for op in input_program.global_block().ops:
if _is_opt_role_op(op):
if op.attr(OP_ROLE_VAR):
grad_name = op.attr(OP_ROLE_VAR)[1]
grad_set.add(grad_name)
if not skip_opt_set:
skip_opt_set = grad_set
else:
skip_opt_set.update(grad_set)
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
for cfg in cfgs: for cfg in cfgs:
cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
......
...@@ -106,6 +106,7 @@ packages=['paddle', ...@@ -106,6 +106,7 @@ packages=['paddle',
'paddle.fluid.layers', 'paddle.fluid.layers',
'paddle.fluid.contrib', 'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize',
'paddle.fluid.transpiler', 'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details'] 'paddle.fluid.transpiler.details']
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册