提交 4c891374 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_api

...@@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING) ...@@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING)
endif() endif()
if(WIN32) if(WIN32)
# windows stupid compile option for all targets. # windows header option for all targets.
add_definitions(-D_XKEYCHECK_H) add_definitions(-D_XKEYCHECK_H)
# Use symbols instead of absolute path, reduce the cmake link command length.
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@")
SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@")
# Specify the program to use when building static libraries
SET(CMAKE_C_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
SET(CMAKE_CXX_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
# set defination for the dll export
if (NOT MSVC)
message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
endif(NOT MSVC)
endif(WIN32) endif(WIN32)
if(NOT WITH_GOLANG) if(NOT WITH_GOLANG)
......
...@@ -27,7 +27,6 @@ endfunction() ...@@ -27,7 +27,6 @@ endfunction()
CheckCompilerCXX11Flag() CheckCompilerCXX11Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# safe_set_flag # safe_set_flag
# #
# Set a compile flag only if compiler is support # Set a compile flag only if compiler is support
...@@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name) ...@@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name)
endif() endif()
endmacro() endmacro()
macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
if (BUILD_SHARED_LIBS)
return() # if build shared libs, the flags keep same with '/MD'
endif(BUILD_SHARED_LIBS)
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS)
...@@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "") ...@@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "")
# Common flags. the compiler flag used for C/C++ sources whenever release or debug # Common flags. the compiler flag used for C/C++ sources whenever release or debug
# Do not care if this flag is support for gcc. # Do not care if this flag is support for gcc.
# https://github.com/PaddlePaddle/Paddle/issues/12773
if (NOT WIN32)
set(COMMON_FLAGS set(COMMON_FLAGS
-fPIC -fPIC
-fno-omit-frame-pointer -fno-omit-frame-pointer
-Werror
-Wall -Wall
-Wextra -Wextra
-Wnon-virtual-dtor -Wnon-virtual-dtor
...@@ -114,11 +131,6 @@ set(COMMON_FLAGS ...@@ -114,11 +131,6 @@ set(COMMON_FLAGS
-Wno-error=terminate # Warning in PADDLE_ENFORCE -Wno-error=terminate # Warning in PADDLE_ENFORCE
) )
# https://github.com/PaddlePaddle/Paddle/issues/12773
if (NOT WIN32)
list(APPEND COMMON_FLAGS -Werror)
endif()
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
-fPIC -fPIC
-fno-omit-frame-pointer -fno-omit-frame-pointer
...@@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS ...@@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS
-Wno-error=array-bounds # Warnings in Eigen::array -Wno-error=array-bounds # Warnings in Eigen::array
) )
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
endif(NOT WIN32)
if (APPLE) if (APPLE)
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
# On Mac OS X build fat binaries with x86_64 architectures by default. # On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif() endif()
else() endif(APPLE)
if(LINUX)
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
-Wall -Wall
-Wextra -Wextra
-Werror -Werror
${GPU_COMMON_FLAGS}) ${GPU_COMMON_FLAGS})
endif() endif(LINUX)
if(UNIX AND NOT APPLE) if(UNIX AND NOT APPLE)
# except apple from nix*Os family # except apple from nix*Os family
set(LINUX TRUE) set(LINUX TRUE)
endif(UNIX AND NOT APPLE) endif(UNIX AND NOT APPLE)
foreach(flag ${COMMON_FLAGS}) foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach() endforeach()
foreach(flag ${GPU_COMMON_FLAGS}) foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag}) safe_set_nvflag(${flag})
endforeach() endforeach()
if(WIN32)
# windows build turn off warnings.
safe_set_static_flag()
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/W3")
endforeach(flag_var)
endif(WIN32)
add_custom_target(paddle_apis ALL add_custom_target(paddle_apis ALL
DEPENDS paddle_v2_apis paddle_fluid_apis) DEPENDS paddle_v2_apis)
add_custom_target(paddle_docs ALL add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn DEPENDS paddle_v2_docs paddle_v2_docs_cn
paddle_fluid_docs paddle_fluid_docs_cn
paddle_mobile_docs paddle_mobile_docs_cn) paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2) add_subdirectory(v2)
add_subdirectory(fluid)
add_subdirectory(mobile) add_subdirectory(mobile)
...@@ -102,8 +102,8 @@ class Float16Transpiler: ...@@ -102,8 +102,8 @@ class Float16Transpiler:
continue continue
for input_arg in current_op.input_arg_names: for input_arg in current_op.input_arg_names:
if input_arg in self.input_map: if input_arg in self.input_map:
current_op.rename_input(input_arg, current_op._rename_input(input_arg,
self.input_map[input_arg]) self.input_map[input_arg])
def _remove_unused_var(self): def _remove_unused_var(self):
''' '''
...@@ -187,7 +187,7 @@ class Float16Transpiler: ...@@ -187,7 +187,7 @@ class Float16Transpiler:
shape=var.shape, shape=var.shape,
persistable=var.persistable) persistable=var.persistable)
find_op(var) find_op(var)
var.op.rename_output(var_name, tmp_var_name) var.op._rename_output(var_name, tmp_var_name)
self.block._insert_op( self.block._insert_op(
i, i,
type="cast", type="cast",
......
...@@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords= ...@@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
...@@ -40,7 +23,7 @@ paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wai ...@@ -40,7 +23,7 @@ paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wai
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
...@@ -176,6 +159,13 @@ paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs= ...@@ -176,6 +159,13 @@ paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs=
paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...@@ -248,6 +238,12 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar ...@@ -248,6 +238,12 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
...@@ -286,7 +282,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw ...@@ -286,7 +282,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -315,6 +311,7 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs ...@@ -315,6 +311,7 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
...@@ -329,7 +326,7 @@ paddle.fluid.transpiler.HashName.reset ArgSpec(args=['self'], varargs=None, keyw ...@@ -329,7 +326,7 @@ paddle.fluid.transpiler.HashName.reset ArgSpec(args=['self'], varargs=None, keyw
paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False)) paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
......
...@@ -13,3 +13,5 @@ if(WITH_INFERENCE) ...@@ -13,3 +13,5 @@ if(WITH_INFERENCE)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
add_subdirectory(inference) add_subdirectory(inference)
endif() endif()
add_subdirectory(train)
...@@ -20,79 +20,37 @@ namespace paddle { ...@@ -20,79 +20,37 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Change it to thread safe flags if needed. template <class T>
class ThreadUnsafeOwnershipFlags { class COWPtr {
public: public:
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} typedef std::shared_ptr<T> RefPtr;
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void SetOwnership(bool flag) { flag_ = flag; }
// Invoke the callback if it is not owned.
template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
}
}
private: private:
bool flag_; RefPtr m_sp;
};
// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public: public:
// Ctor from raw pointer. COWPtr() : m_sp(nullptr) {}
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} explicit COWPtr(T* t) : m_sp(t) {}
// Move methods. Steal ownership from origin const T& Data() const { return *m_sp; }
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
// Copy methods. Not own payload
COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
COWPtr& operator=(const COWPtr& other) {
payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
}
// Access read only data.
const T& Data() const { return *payload_; }
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() { T* MutableData() {
ownership_.AcquireOwnershipOnce( DetachIfNotUnique();
[this] { payload_.reset(new T(*payload_)); }); return m_sp.get();
return payload_.get();
} }
private: void DetachIfNotUnique() {
// Actual data pointer. T* tmp = m_sp.get();
std::shared_ptr<T> payload_; if (!(tmp == nullptr || m_sp.unique())) {
Detach();
}
}
// Ownership flag. void Detach() {
OwnershipFlags ownership_; T* tmp = m_sp.get();
m_sp = RefPtr(new T(*tmp));
}
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -30,6 +30,14 @@ TEST(COWPtr, all) { ...@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10); ASSERT_EQ(ptr2.Data(), 10);
} }
TEST(COWPtr, change_old) {
COWPtr<int> ptr(new int{0});
COWPtr<int> ptr2 = ptr;
*ptr.MutableData() = 10;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(ptr.Data(), 10);
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl( ...@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
PDPattern external_pattern, subblock_pattern; PDPattern external_pattern, subblock_pattern;
// Use the following variables to tell whether this model is RNN1.
// This fuse can only works on the RNN1 model.
std::unordered_set<std::string> specified_vars({"data_lod_attention",
"cell_init", "hidden_init",
"data", "week", "minute"});
int count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsVar() && specified_vars.count(node->Name())) {
++count;
}
}
if (count < specified_vars.size()) {
return graph;
}
// Continue to fuse.
FindWhileOp(graph.get()); FindWhileOp(graph.get());
return graph; return graph;
} }
......
...@@ -26,8 +26,6 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl( ...@@ -26,8 +26,6 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
PADDLE_ENFORCE(graph.get()); PADDLE_ENFORCE(graph.get());
FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
std::unordered_set<Node*> nodes2delete;
GraphPatternDetector gpd; GraphPatternDetector gpd;
auto* conv_input = gpd.mutable_pattern() auto* conv_input = gpd.mutable_pattern()
->NewNode("conv_relu_mkldnn_fuse/conv_input") ->NewNode("conv_relu_mkldnn_fuse/conv_input")
...@@ -42,36 +40,20 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl( ...@@ -42,36 +40,20 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
Graph* g) { Graph* g) {
VLOG(4) << "handle ConvReLU fuse"; VLOG(4) << "handle ConvReLU fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_relu_pattern); // Filter conv_relu_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op
GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out
GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op
// Create an ConvReLU Node. // Transform Conv node into ConvReLU node.
OpDesc desc; OpDesc* desc = conv->Op();
std::string conv_relu_i_in = subgraph.at(conv_input)->Name(); desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
std::string conv_relu_w_in = conv_weight->Name(); desc->SetAttr("fuse_relu", true);
std::string conv_relu_b_in = conv_bias->Name(); GraphSafeRemoveNodes(graph.get(), {relu, conv_out});
std::string conv_relu_out = relu_out->Name();
desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
desc.SetType("conv2d");
for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second);
}
desc.SetAttr("fuse_relu", true);
auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});
PADDLE_ENFORCE(subgraph.count(conv_input)); PADDLE_ENFORCE(subgraph.count(conv_input));
IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node); IR_NODE_LINK_TO(conv, relu_out);
IR_NODE_LINK_TO(conv_weight, conv_relu_node);
IR_NODE_LINK_TO(conv_bias, conv_relu_node);
IR_NODE_LINK_TO(conv_relu_node, relu_out);
found_conv_relu_count++; found_conv_relu_count++;
}; };
......
...@@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) { ...@@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) {
for (auto* node : graph->Nodes()) { for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == "conv2d") { if (node->IsOp() && node->Op()->Type() == "conv2d") {
if (node->Op()->HasAttr("use_mkldnn")) { auto* op = node->Op();
bool use_mkldnn = boost::get<bool>(node->Op()->GetAttr("use_mkldnn")); ASSERT_TRUE(op->HasAttr("use_mkldnn"));
if (use_mkldnn) { EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
if (node->Op()->HasAttr("fuse_relu")) { ASSERT_TRUE(op->HasAttr("fuse_relu"));
bool fuse_relu = boost::get<bool>(node->Op()->GetAttr("fuse_relu")); bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
if (fuse_relu) { if (fuse_relu) {
++conv_relu_count; ++conv_relu_count;
}
}
}
} }
} }
} }
......
...@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
const std::string BatchedCellPreAct = const std::string BatchedCellPreAct =
patterns::UniqueKey("BatchedCellPreAct"); patterns::UniqueKey("BatchedCellPreAct");
const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
const std::string CheckedCell = patterns::UniqueKey("CheckedCell");
scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>(); scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>(); scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>(); scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {}); op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {}); op_desc.SetInput("C0", {});
...@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
op_desc.SetOutput("BatchedGate", {BatchedGate}); op_desc.SetOutput("BatchedGate", {BatchedGate});
op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
op_desc.SetOutput("BatchedInput", {BatchedInput}); op_desc.SetOutput("BatchedInput", {BatchedInput});
op_desc.SetOutput("CheckedCell", {CheckedCell});
op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr // TODO(TJ): get from attr
......
...@@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()( ...@@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()(
->AsInput() ->AsInput()
->assert_is_persistable_var() ->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter"); ->assert_is_op_input("conv2d", "Filter");
// Bias
auto *conv_bias_var = pattern->NewNode(conv_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Bias");
// intermediate variable, will be removed in the IR after fuse. // intermediate variable, will be removed in the IR after fuse.
auto *conv_out_var = pattern->NewNode(conv_out_repr()) auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate() ->AsIntermediate()
...@@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()( ...@@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()(
->AsOutput() ->AsOutput()
->assert_is_op_output("relu"); ->assert_is_op_output("relu");
conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
.LinksTo({conv_out_var});
relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
return relu_out_var; return relu_out_var;
} }
......
...@@ -379,7 +379,7 @@ struct PatternBase { ...@@ -379,7 +379,7 @@ struct PatternBase {
// op: conv + relu // op: conv + relu
// named nodes: // named nodes:
// conv_input, conv_weight, // conv_input, conv_weight,
// conv_bias, conv_out, conv, // conv_out, conv,
// relu_out, relu // relu_out, relu
struct ConvReLU : public PatternBase { struct ConvReLU : public PatternBase {
ConvReLU(PDPattern* pattern, const std::string& name_scope) ConvReLU(PDPattern* pattern, const std::string& name_scope)
...@@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase { ...@@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase {
PATTERN_DECL_NODE(relu); PATTERN_DECL_NODE(relu);
// declare variable node's name // declare variable node's name
PATTERN_DECL_NODE(conv_weight); PATTERN_DECL_NODE(conv_weight);
PATTERN_DECL_NODE(conv_bias);
PATTERN_DECL_NODE(conv_out); PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(relu_out); PATTERN_DECL_NODE(relu_out);
}; };
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_traits.h"
#include <vector>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
......
...@@ -17,10 +17,13 @@ ...@@ -17,10 +17,13 @@
#include <algorithm> #include <algorithm>
#include <initializer_list> #include <initializer_list>
#include <memory> #include <memory>
#include <mutex> // NOLINT
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h" #include "glog/logging.h"
...@@ -28,206 +31,436 @@ namespace paddle { ...@@ -28,206 +31,436 @@ namespace paddle {
namespace framework { namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
private:
void ClearMemory() const {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Vector<T> implements the std::vector interface, and can get Data or // Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside. // MutableData from any place. The data will be synced implicitly inside.
template <typename T> template <typename T>
class Vector { class Vector {
public: public:
using value_type = T; using value_type = T;
using iterator = typename std::vector<T>::iterator;
using const_iterator = typename std::vector<T>::const_iterator;
// Default ctor. Create empty Vector private:
Vector() { InitEmpty(); } // The actual class to implement vector logic
class VectorData {
public:
VectorData() : flag_(kDataInCPU) {}
VectorData(size_t count, const T &value)
: cpu_(count, value), flag_(kDataInCPU) {}
VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
~VectorData() {}
VectorData(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
}
// Fill vector with value. The vector size is `count`. VectorData &operator=(const VectorData &o) {
explicit Vector(size_t count, const T &value = T()) { o.ImmutableCPU();
InitEmpty(); cpu_ = o.cpu_;
if (count != 0) { flag_ = kDataInCPU;
resize(count); details::CUDABuffer null;
T *ptr = begin(); gpu_.Swap(null);
for (size_t i = 0; i < count; ++i) { return *this;
ptr[i] = value; }
T &operator[](size_t i) {
MutableCPU();
return cpu_[i];
}
const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_[i];
}
size_t size() const { return cpu_.size(); }
iterator begin() {
MutableCPU();
return cpu_.begin();
}
iterator end() {
MutableCPU();
return cpu_.end();
}
T &front() {
MutableCPU();
return cpu_.front();
}
T &back() {
MutableCPU();
return cpu_.back();
}
const_iterator begin() const {
ImmutableCPU();
return cpu_.begin();
}
const_iterator end() const {
ImmutableCPU();
return cpu_.end();
}
const T &back() const {
ImmutableCPU();
return cpu_.back();
}
T *data() { return &(*this)[0]; }
const T *data() const { return &(*this)[0]; }
const T &front() const {
ImmutableCPU();
return cpu_.front();
}
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
MutableCPU();
cpu_.assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
MutableCPU();
cpu_.push_back(elem);
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
MutableCPU();
auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
std::copy(begin, end, out_it);
}
// resize the vector
void resize(size_t size) {
MutableCPU();
cpu_.resize(size);
}
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return reinterpret_cast<T *>(gpu_.data_);
}
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
const T *ptr = CUDAData(place);
flag_ = kDirty | kDataInCUDA;
return const_cast<T *>(ptr);
}
// clear
void clear() {
cpu_.clear();
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { return cpu_.capacity(); }
// reserve data
void reserve(size_t size) const { cpu_.reserve(size); }
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
ImmutableCPU();
return cpu_;
}
bool operator==(const VectorData &other) const {
ImmutableCPU();
other.ImmutableCPU();
return cpu_ == other.cpu_;
}
std::mutex &Mutex() const { return mtx_; }
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) {
return nullptr;
} else {
return std::unique_ptr<platform::CUDAPlace>(
new platform::CUDAPlace(gpu_.place_));
} }
} }
}
// Ctor with init_list private:
Vector(std::initializer_list<T> init) { enum DataFlag {
if (init.size() == 0) { kDataInCPU = 0x01,
InitEmpty(); kDataInCUDA = 0x02,
} else { // kDirty means the data has been changed in one device.
InitByIter(init.size(), init.begin(), init.end()); kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::Place(gpu_.place_)));
auto stream = dev_ctx->stream();
void *src = gpu_.data_;
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
stream);
dev_ctx->Wait();
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
} }
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
CopyCPUDataToCUDA(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen");
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen.");
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
}
void ImmutableCPU() const {
if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or
// CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
SetFlag(kDataInCPU);
}
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
bool IsDirty() const { return flag_ & kDirty; }
bool IsInCUDA() const { return flag_ & kDataInCUDA; }
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable int flag_;
mutable std::mutex mtx_;
};
public:
// Default ctor. Create empty Vector
Vector() : m_(new VectorData()) {}
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T())
: m_(new VectorData(count, value)) {}
// Ctor with init_list
Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
// implicit cast from std::vector. // implicit cast from std::vector.
template <typename U> template <typename U>
Vector(const std::vector<U> &dat) { // NOLINT Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
}
} }
// Copy ctor // Copy ctor
Vector(const Vector<T> &other) { this->operator=(other); } Vector(const Vector<T> &other) { m_ = other.m_; }
// Copy operator // Copy operator
Vector<T> &operator=(const Vector<T> &other) { Vector<T> &operator=(const Vector<T> &other) {
if (other.size() != 0) { m_ = other.m_;
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this; return *this;
} }
// Move ctor // Move ctor
Vector(Vector<T> &&other) { Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
}
}
// CPU data access method. Mutable. // CPU data access method. Mutable.
T &operator[](size_t i) { T &operator[](size_t i) { return (*m_.MutableData())[i]; }
MutableCPU();
return const_cast<T *>(cpu_vec_.data<T>())[i];
}
// CPU data access method. Immutable. // CPU data access method. Immutable.
const T &operator[](size_t i) const { const T &operator[](size_t i) const { return m_.Data()[i]; }
ImmutableCPU();
return cpu_vec_.data<T>()[i];
}
// std::vector iterator methods. Based on CPU data access method // std::vector iterator methods. Based on CPU data access method
size_t size() const { return size_; } size_t size() const { return m_.Data().size(); }
T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } iterator begin() { return m_.MutableData()->begin(); }
T *end() { iterator end() { return m_.MutableData()->end(); }
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
T &front() { return *begin(); } T &front() { return m_.MutableData()->front(); }
T &back() { T &back() { return m_.MutableData()->back(); }
auto it = end();
--it;
return *it;
}
const T *begin() const { const_iterator begin() const { return m_.Data().begin(); }
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
}
const T *end() const { const_iterator end() const { return m_.Data().end(); }
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
const T *cbegin() const { return begin(); } const_iterator cbegin() const { return begin(); }
const T *cend() const { return end(); } const_iterator cend() const { return end(); }
const T &back() const { const T &back() const { return m_.Data().back(); }
auto it = end();
--it;
return *it;
}
T *data() { return begin(); } T *data() { return m_.MutableData()->data(); }
const T *data() const { return begin(); } const T *data() const { return m_.Data().data(); }
const T &front() const { return *begin(); } const T &front() const { return m_.Data().front(); }
// end of std::vector iterator methods // end of std::vector iterator methods
// assign this from iterator. // assign this from iterator.
// NOTE: the iterator must support `end-begin` // NOTE: the iterator must support `end-begin`
template <typename Iter> template <typename Iter>
void assign(Iter begin, Iter end) { void assign(Iter begin, Iter end) {
InitByIter(end - begin, begin, end); m_.MutableData()->assign(begin, end);
} }
// push_back. If the previous capacity is not enough, the memory will // push_back. If the previous capacity is not enough, the memory will
// double. // double.
void push_back(T elem) { void push_back(T elem) { m_.MutableData()->push_back(elem); }
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
}
// extend a vector by iterator. // extend a vector by iterator.
// NOTE: the iterator must support end-begin // NOTE: the iterator must support end-begin
template <typename It> template <typename It>
void Extend(It begin, It end) { void Extend(It begin, It end) {
size_t pre_size = size_; m_.MutableData()->Extend(begin, end);
resize(pre_size + (end - begin));
T *ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
} }
// resize the vector // resize the vector
void resize(size_t size) { void resize(size_t size) {
if (size + 1 <= capacity()) { if (m_.Data().size() != size) {
size_ = size; m_.MutableData()->resize(size);
} else {
MutableCPU();
Tensor cpu_tensor;
platform::Place cpu = platform::CPUPlace();
T *ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T *old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
} }
} }
// get cuda ptr. immutable // get cuda ptr. immutable
const T *CUDAData(platform::Place place) const { const T *CUDAData(platform::Place place) const {
PADDLE_ENFORCE(platform::is_gpu_place(place), {
"CUDA Data must on CUDA place"); auto &mtx = m_.Data().Mutex();
ImmutableCUDA(place); std::lock_guard<std::mutex> guard(mtx);
return cuda_vec_.data<T>(); auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAData(place);
} }
// get cuda ptr. mutable // get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) { T *CUDAMutableData(platform::Place place) {
const T *ptr = CUDAData(place); {
flag_ = kDirty | kDataInCUDA; auto &mtx = m_.Data().Mutex();
return const_cast<T *>(ptr); std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAMutableData(place);
} }
// clear // clear
void clear() { void clear() { m_.MutableData()->clear(); }
size_ = 0;
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { size_t capacity() const { return m_.Data().capacity(); }
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
// reserve data // reserve data
void reserve(size_t size) { void reserve(size_t size) { m_.Data().reserve(size); }
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// the unify method to access CPU or CUDA data. immutable. // the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const { const T *Data(platform::Place place) const {
...@@ -248,12 +481,7 @@ class Vector { ...@@ -248,12 +481,7 @@ class Vector {
} }
// implicit cast operator. Vector can be cast to std::vector implicitly. // implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { operator std::vector<T>() const { return m_.Data(); }
std::vector<T> result;
result.resize(size());
std::copy(begin(), end(), result.begin());
return result;
}
bool operator==(const Vector<T> &other) const { bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false; if (size() != other.size()) return false;
...@@ -267,118 +495,11 @@ class Vector { ...@@ -267,118 +495,11 @@ class Vector {
return true; return true;
} }
private: const void *Handle() const { return &m_.Data(); }
void InitEmpty() {
size_ = 0;
flag_ = kDataInCPU;
}
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
platform::Place cpu = platform::CPUPlace();
T *ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
}
flag_ = kDataInCPU | kDirty;
size_ = size;
}
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
WaitPlace(cuda_vec_.place());
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) {
framework::Tensor tmp;
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) {
framework::Tensor tmp;
WaitPlace(cuda_vec_.place());
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void ImmutableCPU() const {
if (IsDirty() &&
!IsInCPU()) { // If data has been changed in CUDA, or CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
SetFlag(kDataInCPU);
}
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
bool IsDirty() const { return flag_ & kDirty; } private:
// Vector is an COW object.
bool IsInCUDA() const { return flag_ & kDataInCUDA; } mutable details::COWPtr<VectorData> m_;
bool IsInCPU() const { return flag_ & kDataInCPU; }
static void WaitPlace(const platform::Place place) {
if (platform::is_gpu_place(place)) {
platform::DeviceContextPool::Instance()
.Get(boost::get<platform::CUDAPlace>(place))
->Wait();
}
}
static T &EmptyDummy() {
static T dummy = T();
return dummy;
}
mutable int flag_;
mutable Tensor cpu_vec_;
mutable Tensor cuda_vec_;
size_t size_;
}; };
#else // PADDLE_WITH_CUDA #else // PADDLE_WITH_CUDA
......
...@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test { ...@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
selected_rows_.reset(new SelectedRows(rows, height)); selected_rows_.reset(new SelectedRows(rows, height));
Tensor* value = selected_rows_->mutable_value(); Tensor* value = selected_rows_->mutable_value();
value->mutable_data<float>( auto* data = value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_); make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
for (int64_t i = 0; i < value->numel(); ++i) {
data[i] = static_cast<float>(i);
}
} }
protected: protected:
...@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ...@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
auto* dst_data = dst_tensor.value().data<float>();
for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
ASSERT_EQ(dst_data[i], static_cast<float>(i));
}
} }
TEST(SelectedRows, SparseTable) { TEST(SelectedRows, SparseTable) {
......
...@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) { ...@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
TEST(Analyzer, analysis_with_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true; FLAGS_IA_enable_tensorrt_subgraph_engine = true;
Argument argument; Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(0));
argument.Set<int>("max_batch_size", new int(3));
argument.Set<int>("workspace_size", new int(1 << 20));
argument.Set<std::string>("precision_mode", new std::string("FP32"));
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
Analyzer analyser; Analyzer analyser;
analyser.Run(&argument); analyser.Run(&argument);
} }
void TestWord2vecPrediction(const std::string &model_path) { void TestWord2vecPrediction(const std::string& model_path) {
NativeConfig config; NativeConfig config;
config.model_dir = model_path; config.model_dir = model_path;
config.use_gpu = false; config.use_gpu = false;
...@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) { ...@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << "data: " LOG(INFO) << "data: "
<< static_cast<float *>(outputs.front().data.data())[i]; << static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i], PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]); result[i]);
} }
} }
......
...@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { ...@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
} }
} }
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, void CreateTrtEngineOp(Node *node, Argument *argument,
framework::proto::BlockDesc *block) { framework::proto::BlockDesc *block) {
PADDLE_ENFORCE(argument->main_dfg.get());
const DataFlowGraph &graph = *(argument->main_dfg);
static int counter{0}; static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock()); PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc; framework::OpDesc desc;
...@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, ...@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
// Set attrs // Set attrs
SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
SetAttr(desc.Proto(), "output_name_mapping", output_mapping); SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
...@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { ...@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
*block_desc.Proto()->mutable_vars() = *block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars(); argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto()); CreateTrtEngineOp(node, argument_, block_desc.Proto());
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops(); auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
......
...@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } ...@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
void SubGraphFuse::ReplaceNodesWithSubGraphs() { void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) { for (auto &subgraph : subgraphs) {
if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
continue;
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end()); std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block // replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/node.h"
...@@ -63,8 +64,11 @@ class SubGraphFuse { ...@@ -63,8 +64,11 @@ class SubGraphFuse {
public: public:
using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
: graph_(graph), node_inside_subgraph_teller_(teller) {} Argument *argument)
: graph_(graph),
node_inside_subgraph_teller_(teller),
argument_(argument) {}
// The main method which run all the logic. // The main method which run all the logic.
void operator()(); void operator()();
...@@ -76,6 +80,7 @@ class SubGraphFuse { ...@@ -76,6 +80,7 @@ class SubGraphFuse {
private: private:
DataFlowGraph *graph_; DataFlowGraph *graph_;
NodeInsideSubgraphTeller node_inside_subgraph_teller_; NodeInsideSubgraphTeller node_inside_subgraph_teller_;
Argument *argument_;
}; };
} // namespace analysis } // namespace analysis
......
...@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) { ...@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) {
TEST(SubGraphSplitter, Fuse) { TEST(SubGraphSplitter, Fuse) {
auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc); auto dfg = ProgramDescToDFG(desc);
Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(3));
size_t count0 = dfg.nodes.size(); size_t count0 = dfg.nodes.size();
SubGraphFuse fuse(&dfg, teller); SubGraphFuse fuse(&dfg, teller, &argument);
fuse(); fuse();
int count1 = 0; int count1 = 0;
......
...@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( ...@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
: node_inside_subgraph_teller_(teller) {} : node_inside_subgraph_teller_(teller) {}
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_)(); SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
VLOG(4) << "debug info " VLOG(4) << "debug info "
<< graph->HumanReadableInfo(false /*show_values*/, << graph->HumanReadableInfo(false /*show_values*/,
true /*show_functions*/); true /*show_functions*/);
......
...@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
bool Initialize(Argument* argument) override { return true; } bool Initialize(Argument* argument) override {
argument_ = argument;
return true;
}
// This class get a sub-graph as input and determine whether to transform this // This class get a sub-graph as input and determine whether to transform this
// sub-graph into TensorRT. // sub-graph into TensorRT.
...@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
private: private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_; NodeInsideSubgraphTeller node_inside_subgraph_teller_;
Argument* argument_;
}; };
} // namespace analysis } // namespace analysis
......
...@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) { ...@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) {
}; };
Argument argument(FLAGS_inference_model_dir); Argument argument(FLAGS_inference_model_dir);
argument.Set<int>("minimum_subgraph_size", new int(0));
argument.Set<int>("max_batch_size", new int(3));
argument.Set<int>("workspace_size", new int(1 << 20));
argument.Set<std::string>("precision_mode", new std::string("FP32"));
DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
......
...@@ -76,10 +76,10 @@ bool AnalysisPredictor::Init( ...@@ -76,10 +76,10 @@ bool AnalysisPredictor::Init(
} }
OptimizeInferenceProgram(); OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) { if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_); executor_->EnableMKLDNN(*inference_program_);
} }
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables"; VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get()); PADDLE_ENFORCE(scope_.get());
......
...@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) { bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true; FLAGS_IA_enable_tensorrt_subgraph_engine = true;
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
FLAGS_tensorrt_workspace_size = config_.workspace_size;
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
} else { } else {
...@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
void OptimizeInferenceProgram() { void OptimizeInferenceProgram() {
// Analyze inference_program // Analyze inference_program
Argument argument; Argument argument;
argument.Set<int>("minimum_subgraph_size",
new int(config_.minimum_subgraph_size));
argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
argument.Set<int>("workspace_size", new int(config_.workspace_size));
argument.Set<std::string>("precision_mode",
new std::string(config_.precision_mode));
if (!config_.model_dir.empty()) { if (!config_.model_dir.empty()) {
argument.fluid_model_dir.reset(new std::string(config_.model_dir)); argument.fluid_model_dir.reset(new std::string(config_.model_dir));
} else { } else {
......
...@@ -194,6 +194,14 @@ struct MixedRTConfig : public NativeConfig { ...@@ -194,6 +194,14 @@ struct MixedRTConfig : public NativeConfig {
// For workspace_size, refer it from here: // For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int workspace_size{1 << 30}; int workspace_size{1 << 30};
// We transform the Ops that can be converted into TRT layer in the model,
// and aggregate these Ops into subgraphs for TRT execution.
// We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value.
int minimum_subgraph_size = 3;
// Reserved configuration
// We just support "FP32" now, "FP16" and "INT8" will be supported.
std::string precision_mode = "FP32";
}; };
// NOTE WIP, not stable yet. // NOTE WIP, not stable yet.
...@@ -204,10 +212,11 @@ struct AnalysisConfig : public NativeConfig { ...@@ -204,10 +212,11 @@ struct AnalysisConfig : public NativeConfig {
kExclude // Specify the disabled passes in `ir_passes`. kExclude // Specify the disabled passes in `ir_passes`.
}; };
// Determine whether to perform graph optimization.
bool enable_ir_optim = true; bool enable_ir_optim = true;
// Manually determine the IR passes to run.
IrPassMode ir_mode{IrPassMode::kExclude}; IrPassMode ir_mode{IrPassMode::kExclude};
// attention lstm fuse works only on some specific models, disable as default. std::vector<std::string> ir_passes;
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
// NOTE this is just for internal development, please not use it. // NOTE this is just for internal development, please not use it.
bool _use_mkldnn{false}; bool _use_mkldnn{false};
......
...@@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi ...@@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
# seq_conv1
set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1")
download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc)
# ocr # ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR}) if (NOT EXISTS ${OCR_INSTALL_DIR})
...@@ -85,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI ...@@ -85,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
DEPS inference_anakin_api_shared dynload_cuda SERIAL) DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif() endif()
endif() endif()
if(WITH_GPU AND TENSORRT_FOUND)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
cc_test(test_trt_models SRCS trt_models_tester.cc
ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
DEPS paddle_inference_tensorrt_subgraph_engine)
endif()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
struct DataRecord {
std::vector<std::vector<int64_t>> title1_all, title2_all, title3_all, l1_all;
std::vector<std::vector<int64_t>> title1, title2, title3, l1;
std::vector<size_t> title1_lod, title2_lod, title3_lod, l1_lod;
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= title1_all.size()) {
data.title1_all.assign(title1_all.begin() + batch_iter,
title1_all.begin() + batch_end);
data.title2_all.assign(title2_all.begin() + batch_iter,
title2_all.begin() + batch_end);
data.title3_all.assign(title3_all.begin() + batch_iter,
title3_all.begin() + batch_end);
data.l1_all.assign(l1_all.begin() + batch_iter,
l1_all.begin() + batch_end);
// Prepare LoDs
data.title1_lod.push_back(0);
data.title2_lod.push_back(0);
data.title3_lod.push_back(0);
data.l1_lod.push_back(0);
CHECK(!data.title1_all.empty());
CHECK(!data.title2_all.empty());
CHECK(!data.title3_all.empty());
CHECK(!data.l1_all.empty());
CHECK_EQ(data.title1_all.size(), data.title2_all.size());
CHECK_EQ(data.title1_all.size(), data.title3_all.size());
CHECK_EQ(data.title1_all.size(), data.l1_all.size());
for (size_t j = 0; j < data.title1_all.size(); j++) {
data.title1.push_back(data.title1_all[j]);
data.title2.push_back(data.title2_all[j]);
data.title3.push_back(data.title3_all[j]);
data.l1.push_back(data.l1_all[j]);
// calculate lod
data.title1_lod.push_back(data.title1_lod.back() +
data.title1_all[j].size());
data.title2_lod.push_back(data.title2_lod.back() +
data.title2_all[j].size());
data.title3_lod.push_back(data.title3_lod.back() +
data.title3_all[j].size());
data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size());
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, '\t', &data);
// load title1 data
std::vector<int64_t> title1_data;
split_to_int64(data[0], ' ', &title1_data);
// load title2 data
std::vector<int64_t> title2_data;
split_to_int64(data[1], ' ', &title2_data);
// load title3 data
std::vector<int64_t> title3_data;
split_to_int64(data[2], ' ', &title3_data);
// load l1 data
std::vector<int64_t> l1_data;
split_to_int64(data[3], ' ', &l1_data);
title1_all.push_back(std::move(title1_data));
title2_all.push_back(std::move(title2_data));
title3_all.push_back(std::move(title3_data));
l1_all.push_back(std::move(l1_data));
}
num_samples = num_lines;
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor title1_tensor, title2_tensor, title3_tensor, l1_tensor;
title1_tensor.name = "title1";
title2_tensor.name = "title2";
title3_tensor.name = "title3";
l1_tensor.name = "l1";
auto one_batch = data->NextBatch();
int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1];
title1_tensor.shape.assign({title1_size, 1});
title1_tensor.lod.assign({one_batch.title1_lod});
int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1];
title2_tensor.shape.assign({title2_size, 1});
title2_tensor.lod.assign({one_batch.title2_lod});
int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1];
title3_tensor.shape.assign({title3_size, 1});
title3_tensor.lod.assign({one_batch.title3_lod});
int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1];
l1_tensor.shape.assign({l1_size, 1});
l1_tensor.lod.assign({one_batch.l1_lod});
// assign data
TensorAssignData<int64_t>(&title1_tensor, one_batch.title1);
TensorAssignData<int64_t>(&title2_tensor, one_batch.title2);
TensorAssignData<int64_t>(&title3_tensor, one_batch.title3);
TensorAssignData<int64_t>(&l1_tensor, one_batch.l1);
// Set inputs.
input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::INT64;
}
}
void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots;
int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
for (int bid = 0; bid < epoch; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
}
}
// Easy for profiling independently.
TEST(Analyzer_seq_conv1, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
size_t size = GetSize(outputs[0]);
PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(outputs[0].data.data());
// output is probability, which is in (0, 1).
for (size_t i = 0; i < size; i++) {
EXPECT_GT(result[i], 0);
EXPECT_LT(result[i], 1);
}
}
}
// Check the fuse status
TEST(Analyzer_seq_conv1, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_seq_conv1, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
}
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
using paddle::contrib::MixedRTConfig;
DEFINE_string(dirname, "", "Directory of the inference model.");
NativeConfig GetConfigNative() {
NativeConfig config;
config.model_dir = FLAGS_dirname;
// LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.45;
config.use_gpu = true;
config.device = 0;
return config;
}
MixedRTConfig GetConfigTRT() {
MixedRTConfig config;
config.model_dir = FLAGS_dirname;
config.use_gpu = true;
config.fraction_of_gpu_memory = 0.2;
config.device = 0;
config.max_batch_size = 3;
return config;
}
void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
NativeConfig config0 = GetConfigNative();
config0.model_dir = model_dirname;
MixedRTConfig config1 = GetConfigTRT();
config1.model_dir = model_dirname;
config1.max_batch_size = batch_size;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
// Prepare inputs
int height = 224;
int width = 224;
float *data = new float[batch_size * 3 * height * width];
memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
data[0] = 1.0f;
// Prepare inputs
PaddleTensor tensor;
tensor.name = "input_0";
tensor.shape = std::vector<int>({batch_size, 3, height, width});
tensor.data = PaddleBuf(static_cast<void *>(data),
sizeof(float) * (batch_size * 3 * height * width));
tensor.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
// Prepare outputs
std::vector<PaddleTensor> outputs0;
std::vector<PaddleTensor> outputs1;
CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
// Get output.
ASSERT_EQ(outputs0.size(), 1UL);
ASSERT_EQ(outputs1.size(), 1UL);
const size_t num_elements = outputs0.front().data.length() / sizeof(float);
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
EXPECT_EQ(num_elements, num_elements1);
auto *data0 = static_cast<float *>(outputs0.front().data.data());
auto *data1 = static_cast<float *>(outputs1.front().data.data());
ASSERT_GT(num_elements, 0UL);
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
EXPECT_NEAR(data0[i], data1[i], 1e-3);
}
}
TEST(trt_models_test, main) {
std::vector<std::string> infer_models = {"mobilenet", "resnet50",
"resnext50"};
for (auto &model_dir : infer_models) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
}
}
} // namespace paddle
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel { ...@@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
} }
}; };
__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC( UNUSED constexpr char SigmoidDoc[] = R"DOC(
Sigmoid Activation Operator Sigmoid Activation Operator
$$out = \frac{1}{1 + e^{-x}}$$ $$out = \frac{1}{1 + e^{-x}}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC( UNUSED constexpr char LogSigmoidDoc[] = R"DOC(
Logsigmoid Activation Operator Logsigmoid Activation Operator
$$out = \\log \\frac{1}{1 + e^{-x}}$$ $$out = \\log \\frac{1}{1 + e^{-x}}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char ExpDoc[] = R"DOC( UNUSED constexpr char ExpDoc[] = R"DOC(
Exp Activation Operator. Exp Activation Operator.
$out = e^x$ $out = e^x$
)DOC"; )DOC";
__attribute__((unused)) constexpr char ReluDoc[] = R"DOC( UNUSED constexpr char ReluDoc[] = R"DOC(
Relu Activation Operator. Relu Activation Operator.
$out = \max(x, 0)$ $out = \max(x, 0)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char TanhDoc[] = R"DOC( UNUSED constexpr char TanhDoc[] = R"DOC(
Tanh Activation Operator. Tanh Activation Operator.
$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( UNUSED constexpr char TanhShrinkDoc[] = R"DOC(
TanhShrink Activation Operator. TanhShrink Activation Operator.
$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC( UNUSED constexpr char SqrtDoc[] = R"DOC(
Sqrt Activation Operator. Sqrt Activation Operator.
$out = \sqrt{x}$ $out = \sqrt{x}$
)DOC"; )DOC";
__attribute__((unused)) constexpr char AbsDoc[] = R"DOC( UNUSED constexpr char AbsDoc[] = R"DOC(
Abs Activation Operator. Abs Activation Operator.
$out = |x|$ $out = |x|$
)DOC"; )DOC";
__attribute__((unused)) constexpr char CeilDoc[] = R"DOC( UNUSED constexpr char CeilDoc[] = R"DOC(
Ceil Activation Operator. Ceil Activation Operator.
$out = ceil(x)$ $out = ceil(x)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char FloorDoc[] = R"DOC( UNUSED constexpr char FloorDoc[] = R"DOC(
Floor Activation Operator. Floor Activation Operator.
$out = floor(x)$ $out = floor(x)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char CosDoc[] = R"DOC( UNUSED constexpr char CosDoc[] = R"DOC(
Cosine Activation Operator. Cosine Activation Operator.
$out = cos(x)$ $out = cos(x)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char SinDoc[] = R"DOC( UNUSED constexpr char SinDoc[] = R"DOC(
Sine Activation Operator. Sine Activation Operator.
$out = sin(x)$ $out = sin(x)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char RoundDoc[] = R"DOC( UNUSED constexpr char RoundDoc[] = R"DOC(
Round Activation Operator. Round Activation Operator.
$out = [x]$ $out = [x]$
)DOC"; )DOC";
__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC( UNUSED constexpr char ReciprocalDoc[] = R"DOC(
Reciprocal Activation Operator. Reciprocal Activation Operator.
$$out = \\frac{1}{x}$$ $$out = \\frac{1}{x}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char LogDoc[] = R"DOC( UNUSED constexpr char LogDoc[] = R"DOC(
Log Activation Operator. Log Activation Operator.
$out = \ln(x)$ $out = \ln(x)$
...@@ -212,21 +213,21 @@ Natural logarithm of x. ...@@ -212,21 +213,21 @@ Natural logarithm of x.
)DOC"; )DOC";
__attribute__((unused)) constexpr char SquareDoc[] = R"DOC( UNUSED constexpr char SquareDoc[] = R"DOC(
Square Activation Operator. Square Activation Operator.
$out = x^2$ $out = x^2$
)DOC"; )DOC";
__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC( UNUSED constexpr char SoftplusDoc[] = R"DOC(
Softplus Activation Operator. Softplus Activation Operator.
$out = \ln(1 + e^{x})$ $out = \ln(1 + e^{x})$
)DOC"; )DOC";
__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC( UNUSED constexpr char SoftsignDoc[] = R"DOC(
Softsign Activation Operator. Softsign Activation Operator.
$$out = \frac{x}{1 + |x|}$$ $$out = \frac{x}{1 + |x|}$$
......
...@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel { ...@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
"Out and Label should have same height."); "Out and Label should have same height.");
int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1; int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
int slide_steps = ctx->Attrs().Get<int>("slide_steps");
PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1");
PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number");
ctx->SetOutputDim("AUC", {1}); ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("BatchAUC", {1});
ctx->SetOutputDim("StatPosOut", {num_pred_buckets}); slide_steps = slide_steps == 0 ? 1 : slide_steps;
ctx->SetOutputDim("StatNegOut", {num_pred_buckets}); ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
} }
protected: protected:
...@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", AddInput("Label",
"A 2D int tensor indicating the label of the training data. " "A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]"); "shape: [batch_size, 1]");
// TODO(typhoonzero): support weight input // TODO(typhoonzero): support weight input
AddInput("StatPos", "Statistic value when label = 1"); AddInput("StatPos", "Statistic value when label = 1");
AddInput("StatNeg", "Statistic value when label = 0"); AddInput("StatNeg", "Statistic value when label = 0");
...@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("AUC", AddOutput("AUC",
"A scalar representing the " "A scalar representing the "
"current area-under-the-curve."); "current area-under-the-curve.");
AddOutput("BatchAUC", "The AUC for current batch");
AddOutput("StatPosOut", "Statistic value when label = 1"); AddOutput("StatPosOut", "Statistic value when label = 1");
AddOutput("StatNegOut", "Statistic value when label = 0"); AddOutput("StatNegOut", "Statistic value when label = 0");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.") AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC"); .SetDefault("ROC");
AddAttr<int>("num_thresholds", AddAttr<int>(
"The number of thresholds to use when discretizing the" "num_thresholds",
" roc curve.") "The number of thresholds to use when discretizing the roc curve.")
.SetDefault((2 << 12) - 1); .SetDefault((2 << 12) - 1);
AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
.SetDefault(1);
AddComment(R"DOC( AddComment(R"DOC(
Area Under The Curve (AUC) Operator. Area Under The Curve (AUC) Operator.
......
...@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {
std::string curve = ctx.Attr<std::string>("curve"); std::string curve = ctx.Attr<std::string>("curve");
int num_thresholds = ctx.Attr<int>("num_thresholds"); int num_thresholds = ctx.Attr<int>("num_thresholds");
// buckets contain numbers from 0 to num_thresholds
int num_pred_buckets = num_thresholds + 1; int num_pred_buckets = num_thresholds + 1;
int slide_steps = ctx.Attr<int>("slide_steps");
// Only use output var for now, make sure it's persistable and // Only use output var for now, make sure it's persistable and
// not cleaned up for each batch. // not cleaned up for each batch.
...@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> {
auto *stat_pos = ctx.Output<Tensor>("StatPosOut"); auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
auto *stat_neg = ctx.Output<Tensor>("StatNegOut"); auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace()); auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace()); auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
auc);
auto *batch_auc = ctx.Output<Tensor>("BatchAUC"); std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0); std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(), auto stat_pos_calc = stat_pos_data.data();
num_thresholds, batch_auc); auto stat_neg_calc = stat_neg_data.data();
statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
} }
private: private:
...@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> {
return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
} }
inline static void calcAuc(const framework::ExecutionContext &ctx, inline static void statAuc(const framework::Tensor *label,
const framework::Tensor *label,
const framework::Tensor *predict, const framework::Tensor *predict,
int64_t *stat_pos, int64_t *stat_neg, const int num_pred_buckets,
int num_thresholds, const int num_thresholds, const int slide_steps,
framework::Tensor *auc_tensor) { int64_t *origin_stat_pos, int64_t *origin_stat_neg,
int64_t **stat_pos, int64_t **stat_neg) {
size_t batch_size = predict->dims()[0]; size_t batch_size = predict->dims()[0];
size_t inference_width = predict->dims()[1]; size_t inference_width = predict->dims()[1];
const T *inference_data = predict->data<T>(); const T *inference_data = predict->data<T>();
const auto *label_data = label->data<int64_t>(); const auto *label_data = label->data<int64_t>();
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
for (size_t i = 0; i < batch_size; i++) { for (size_t i = 0; i < batch_size; i++) {
uint32_t binIdx = static_cast<uint32_t>( uint32_t binIdx = static_cast<uint32_t>(
inference_data[i * inference_width + 1] * num_thresholds); inference_data[i * inference_width + 1] * num_thresholds);
if (label_data[i]) { if (label_data[i]) {
stat_pos[binIdx] += 1.0; (*stat_pos)[binIdx] += 1.0;
} else { } else {
stat_neg[binIdx] += 1.0; (*stat_neg)[binIdx] += 1.0;
} }
} }
int bucket_length = num_pred_buckets * sizeof(int64_t);
// will stat auc unlimited.
if (slide_steps == 0) {
for (int slide = 0; slide < num_pred_buckets; ++slide) {
origin_stat_pos[slide] += (*stat_pos)[slide];
origin_stat_neg[slide] += (*stat_neg)[slide];
}
*stat_pos = origin_stat_pos;
*stat_neg = origin_stat_neg;
} else {
for (int slide = 1; slide < slide_steps; ++slide) {
int dst_idx = (slide - 1) * num_pred_buckets;
int src_inx = slide * num_pred_buckets;
std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx,
bucket_length);
std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx,
bucket_length);
}
std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
*stat_pos, bucket_length);
std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
*stat_neg, bucket_length);
std::memset(*stat_pos, 0, bucket_length);
std::memset(*stat_neg, 0, bucket_length);
for (int slide = 0; slide < num_pred_buckets; ++slide) {
int stat_pos_steps = 0;
int stat_neg_steps = 0;
for (int step = 0; step < slide_steps; ++step) {
stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
}
(*stat_pos)[slide] += stat_pos_steps;
(*stat_neg)[slide] += stat_neg_steps;
}
}
}
inline static void calcAuc(const framework::ExecutionContext &ctx,
int64_t *stat_pos, int64_t *stat_neg,
int num_thresholds,
framework::Tensor *auc_tensor) {
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
*auc = 0.0f; *auc = 0.0f;
double totPos = 0.0; double totPos = 0.0;
...@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> {
totPos += stat_pos[idx]; totPos += stat_pos[idx];
totNeg += stat_neg[idx]; totNeg += stat_neg[idx];
*auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
--idx; --idx;
} }
......
...@@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc ...@@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu) polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
#Export local libraries to parent #Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { ...@@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()), framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
platform::CPUPlace()); ctx.device_context());
} }
}; };
...@@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, ...@@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
const T *im_info_data = im_info.data<T>(); const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace()); T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
T im_scale = im_info_data[2]; T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0], 1}); keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f); min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>(ctx.GetPlace()); int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
...@@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("post_nms_topN", "post_nms_topN"); AddAttr<int>("post_nms_topN", "post_nms_topN");
AddAttr<float>("nms_thresh", "nms_thres"); AddAttr<float>("nms_thresh", "nms_thres");
AddAttr<float>("min_size", "min size"); AddAttr<float>("min_size", "min size");
AddAttr<float>("eta", "eta"); AddAttr<float>("eta", "The parameter for adaptive NMS.");
AddComment(R"DOC( AddComment(R"DOC(
Generate Proposals OP Generate Proposals OP
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
namespace {
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const kThreadsPerBlock = sizeof(uint64_t) * 8;
template <typename T>
__global__ void RangeInitKernel(const T start, const T delta, const int size,
T *out) {
CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
}
template <typename T>
void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
Tensor *value_out, Tensor *index_out) {
int num = value.numel();
Tensor index_in_t;
int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream();
RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
const T *keys_in = value.data<T>();
T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
// Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
memory::Free(place, d_temp_storage);
}
template <typename T>
__device__ __forceinline__ T Min(T x, T y) {
return x < y ? x : y;
}
template <typename T>
__device__ __forceinline__ T Max(T x, T y) {
return x > y ? x : y;
}
template <typename T>
__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
const T *var, const int *index,
const T *im_info, const int num,
T *proposals) {
T kBBoxClipDefault = log(1000.0 / 16.0);
CUDA_1D_KERNEL_LOOP(i, num) {
int k = index[i] * 4;
T axmin = anchor[k];
T aymin = anchor[k + 1];
T axmax = anchor[k + 2];
T aymax = anchor[k + 3];
T w = axmax - axmin + 1.0;
T h = aymax - aymin + 1.0;
T cx = axmin + 0.5 * w;
T cy = aymin + 0.5 * h;
T dxmin = deltas[k];
T dymin = deltas[k + 1];
T dxmax = deltas[k + 2];
T dymax = deltas[k + 3];
T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
if (var) {
d_cx = cx + dxmin * w * var[k];
d_cy = cy + dymin * h * var[k + 1];
d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
} else {
d_cx = cx + dxmin * w;
d_cy = cy + dymin * h;
d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
}
T oxmin = d_cx - d_w * 0.5;
T oymin = d_cy - d_h * 0.5;
T oxmax = d_cx + d_w * 0.5 - 1.;
T oymax = d_cy + d_h * 0.5 - 1.;
proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
}
}
template <typename T, int BlockSize>
__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
const T min_size, const int num, int *keep_num,
int *keep) {
T im_h = im_info[0];
T im_w = im_info[1];
T im_scale = im_info[2];
int cnt = 0;
__shared__ int keep_index[BlockSize];
CUDA_1D_KERNEL_LOOP(i, num) {
keep_index[threadIdx.x] = -1;
__syncthreads();
int k = i * 4;
T xmin = bboxes[k];
T ymin = bboxes[k + 1];
T xmax = bboxes[k + 2];
T ymax = bboxes[k + 3];
T w = xmax - xmin + 1.0;
T h = ymax - ymin + 1.0;
T cx = xmin + w / 2.;
T cy = ymin + h / 2.;
T w_s = (xmax - xmin) / im_scale + 1.;
T h_s = (ymax - ymin) / im_scale + 1.;
if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) {
keep_index[threadIdx.x] = i;
}
__syncthreads();
if (threadIdx.x == 0) {
int size = (num - i) < BlockSize ? num - i : BlockSize;
for (int j = 0; j < size; ++j) {
if (keep_index[j] > -1) {
keep[cnt++] = keep_index[j];
}
}
}
__syncthreads();
}
if (threadIdx.x == 0) {
keep_num[0] = cnt;
}
}
__device__ inline float IoU(const float *a, const float *b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float inter_s = width * height;
float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return inter_s / (s_a + s_b - inter_s);
}
__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
const float *dev_boxes, uint64_t *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
const int row_size =
min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock);
const int col_size =
min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock);
__shared__ float block_boxes[kThreadsPerBlock * 4];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 4 + 0] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0];
block_boxes[threadIdx.x * 4 + 1] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1];
block_boxes[threadIdx.x * 4 + 2] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2];
block_boxes[threadIdx.x * 4 + 3] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
uint64_t t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
template <typename T>
void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const Tensor &sorted_indices, const T nms_threshold,
Tensor *keep_out) {
int boxes_num = proposals.dims()[0];
PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
DIVUP(boxes_num, kThreadsPerBlock));
dim3 threads(kThreadsPerBlock);
const T *boxes = proposals.data<T>();
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
uint64_t *d_mask =
reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
uint64_t *h_mask = reinterpret_cast<uint64_t *>(
memory::Alloc(platform::CPUPlace(), size_bytes));
memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<int> keep_vec;
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / kThreadsPerBlock;
int inblock = i % kThreadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
++num_to_keep;
keep_vec.push_back(i);
uint64_t *p = &h_mask[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
sizeof(int) * num_to_keep, 0);
memory::Free(place, d_mask);
memory::Free(platform::CPUPlace(), h_mask);
}
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
const platform::CUDADeviceContext &ctx, const Tensor &im_info,
const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas, // [M, 4]
const Tensor &scores, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
// 1. pre nms
Tensor scores_sort, index_sort;
SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
int num = scores.numel();
int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
: pre_nms_top_n;
scores_sort.Resize({pre_nms_num, 1});
index_sort.Resize({pre_nms_num, 1});
// 2. box decode and clipping
Tensor proposals;
proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream();
BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
proposals.data<T>());
// 3. filter
Tensor keep_index, keep_num_t;
keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
min_size = std::max(min_size, 1.0f);
FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
keep_num_t.data<int>(), keep_index.data<int>());
int keep_num;
const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
keep_num_t.data<int>(), sizeof(int), 0);
keep_index.Resize({keep_num});
Tensor scores_filter, proposals_filter;
proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
if (nms_thresh <= 0) {
return std::make_pair(proposals_filter, scores_filter);
}
// 4. nms
Tensor keep_nms;
NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
Tensor scores_nms, proposals_nms;
proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
return std::make_pair(proposals_nms, scores_nms);
}
} // namespace
template <typename DeviceContext, typename T>
class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *scores = context.Input<Tensor>("Scores");
auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
auto *im_info = context.Input<Tensor>("ImInfo");
auto *anchors = context.Input<Tensor>("Anchors");
auto *variances = context.Input<Tensor>("Variances");
auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
int post_nms_top_n = context.Attr<int>("post_nms_topN");
float nms_thresh = context.Attr<float>("nms_thresh");
float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta");
PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS.");
auto &dev_ctx = context.template device_context<DeviceContext>();
auto scores_dim = scores->dims();
int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3];
auto bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3];
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
dev_ctx.GetPlace());
scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
dev_ctx.GetPlace());
math::Transpose<DeviceContext, T, 4> trans;
std::vector<int> axis = {0, 2, 3, 1};
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis);
Tensor *anchor = const_cast<framework::Tensor *>(anchors);
anchor->Resize({anchors->numel() / 4, 4});
Tensor *var = const_cast<framework::Tensor *>(variances);
var->Resize({var->numel() / 4, 4});
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
T *rpn_rois_data = rpn_rois->data<T>();
T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
int64_t num_proposals = 0;
std::vector<size_t> offset(1, 0);
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> box_score_pair =
ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta);
Tensor proposals = box_score_pair.first;
Tensor scores = box_score_pair.second;
memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
scores.data<T>(), sizeof(T) * scores.numel(), 0);
num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals);
}
framework::LoD lod;
lod.emplace_back(offset);
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4});
rpn_roi_probs->Resize({num_proposals, 1});
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(generate_proposals,
ops::CUDAGenerateProposalsKernel<
paddle::platform::CUDADeviceContext, float>);
...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type")); auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
auto label_lod = in_label->lod(); auto& label_lod = in_label->lod();
auto detect_lod = in_detect->lod(); auto& detect_lod = in_detect->lod();
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label); auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect); auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto label_lod = input_label.lod(); auto& label_lod = input_label.lod();
auto detect_lod = input_detect.lod(); auto& detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1; int batch_size = label_lod[0].size() - 1;
auto label_index = label_lod[0]; auto& label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes; std::map<int, std::vector<Box>> boxes;
...@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod); output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod); output_false_pos->set_lod(false_pos_lod);
return;
} }
void GetInputPos(const framework::Tensor& input_pos_count, void GetInputPos(const framework::Tensor& input_pos_count,
...@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) { std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>(); const T* pos_data = pos_tensor.data<T>();
auto pos_data_lod = pos_tensor.lod()[0]; auto& pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2]; T score = pos_data[j * 2];
...@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const { std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size(); int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
auto image_gt_boxes = gt_boxes[n]; auto& image_gt_boxes = gt_boxes[n];
for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { for (auto& image_gt_box : image_gt_boxes) {
size_t count = 0; size_t count = 0;
auto labeled_bboxes = it->second; auto& labeled_bboxes = image_gt_box.second;
if (evaluate_difficult) { if (evaluate_difficult) {
count = labeled_bboxes.size(); count = labeled_bboxes.size();
} else { } else {
for (size_t i = 0; i < labeled_bboxes.size(); ++i) for (auto& box : labeled_bboxes) {
if (!(labeled_bboxes[i].is_difficult)) ++count; if (!box.is_difficult) {
++count;
}
}
} }
if (count == 0) { if (count == 0) {
continue; continue;
} }
int label = it->first; int label = image_gt_box.first;
if (label_pos_count->find(label) == label_pos_count->end()) { if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count; (*label_pos_count)[label] = count;
} else { } else {
......
...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { ...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>(); auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto in_rows = in.rows(); auto &in_rows = in.rows();
auto out_dim = framework::make_ddim( auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1}); std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place()); auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
......
...@@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1, PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1."); "The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE_EQ( if (ctx->Attrs().Get<bool>("use_peepholes")) {
b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size, PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
"The second dimension of Input(Bias) should be " "The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection or" "7 * %d if enable peepholes connection",
"4 * %d if disable peepholes", frame_size);
frame_size, frame_size); ctx->SetOutputDim("CheckedCell", {2, frame_size});
} else {
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be "
"4 * %d if disable peepholes",
frame_size);
}
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
...@@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() { ...@@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() {
AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.")
.AsIntermediate();
AddAttr<bool>("use_peepholes", AddAttr<bool>("use_peepholes",
"(bool, defalut: True) " "(bool, defalut: True) "
"whether to enable diagonal/peephole connections.") "whether to enable diagonal/peephole connections.")
...@@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
const int D3 = D * 3; \ const int D3 = D * 3; \
const int D4 = wh_dims[1]; const int D4 = wh_dims[1];
#define INIT_BASE_INPUT_DATAS \ #define INIT_BASE_INPUT_DATAS \
const T* x_data = x->data<T>(); \ const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \ const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \ const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \ /* diagonal weight*/ \
const T* wc_data = bias->data<T>() + D4; \ const T* wc_data = bias->data<T>() + D4; \
/* for peephole only*/ \ /* for peephole only*/ \
Tensor checked_cell; \ T* checked_cell_data = nullptr; \
T* checked_cell_data = nullptr; \ auto place = ctx.GetPlace(); \
auto place = ctx.GetPlace(); \ if (use_peepholes) { \
if (use_peepholes) { \ /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \ checked_cell_data = checked_cell->mutable_data<T>(place); \
} }
/// Compute LSTM /// Compute LSTM
......
...@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> { ...@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace()); auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here. // TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(), memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place, gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value(); auto *d_table_value = d_table->mutable_value();
......
...@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy( memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::CUDAPlace>(out_place), out_data, boost::get<platform::CUDAPlace>(in1_place), in1_data,
boost::get<platform::CUDAPlace>(in1_place), in1_data, in1_value.numel() * sizeof(T), context.stream());
in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
auto* in2_data = in2_value.data<T>(); auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place), memory::Copy(boost::get<platform::CUDAPlace>(out_place),
...@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();
......
...@@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims, ...@@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims,
dims2str(paddings) + pooling_type + suffix; dims2str(paddings) + pooling_type + suffix;
} }
static inline int ComputeCeiledOutput(int input_size, int kernel_size,
int padding, int stride) {
return (input_size - kernel_size + 2 * padding) / stride + 1;
}
static inline void CorrectOutputSize(
const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
const std::vector<int>& kernel_size, const std::vector<int>& paddings,
const std::vector<int>& strides,
std::vector<int>& right_bot_padding) { // NOLINT
for (size_t i = 0; i < right_bot_padding.size(); i++) {
int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i],
paddings[i], strides[i]);
if (desired_size != dst_tz[i + 2]) {
right_bot_padding[i] += strides[i];
}
}
}
template <typename T> template <typename T>
class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public: public:
...@@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto pool_p = auto pool_p =
std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p)); std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
if (pool_p == nullptr) { if (pool_p == nullptr) {
const std::vector<int>& padding_left_top(paddings);
std::vector<int> padding_right_bottom(paddings);
bool ceil_mode = ctx.Attr<bool>("ceil_mode");
if (ceil_mode) {
CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
padding_right_bottom);
}
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), input_format); src_tz, platform::MKLDNNGetDataType<T>(), input_format);
...@@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn::memory::format::any); mkldnn::memory::format::any);
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd = std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top,
pooling_type, mkldnn_engine); padding_right_bottom, ksize, pooling_type,
mkldnn_engine, ceil_mode);
// save pool_pd into global device context to be referred in backward path // save pool_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_pool_pd, pool_pd); dev_ctx.SetBlob(key_pool_pd, pool_pd);
...@@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
private: private:
std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc( std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
const std::vector<int>& stride, const std::vector<int>& padding, const std::vector<int>& stride, const std::vector<int>& padding_left_top,
const std::vector<int>& kernel, const std::string& pooling_type, const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
const mkldnn::engine& engine) const { const std::string& pooling_type, const mkldnn::engine& engine,
bool ceil_mode) const {
auto pool_desc = mkldnn::pooling_forward::desc( auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward, mkldnn::prop_kind::forward,
pooling_type == "max" ? mkldnn::algorithm::pooling_max pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg, : mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero); src, dst, stride, kernel, padding_left_top, padding_right_bot,
mkldnn::padding_kind::zero);
auto p_pool_pd = auto p_pool_pd =
new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); new mkldnn::pooling_forward::primitive_desc(pool_desc, engine);
......
...@@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
SamplingId Operator. SamplingId Operator.
A layer for sampling id from multinomial distribution from the A layer for sampling id from multinomial distribution from the
input. Sampling one id for one sample.)DOC"); input. Sampling one id for one sample.)DOC");
AddAttr<float>("min", "Minimum value of random. [default 0.0].") AddAttr<float>("min", "Minimum value of random. (float, default 0.0).")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<float>("max", "Maximun value of random. [default 1.0].") AddAttr<float>("max", "Maximun value of random. (float, default 1.0).")
.SetDefault(1.0f); .SetDefault(1.0f);
AddAttr<int>("seed", AddAttr<int>(
"Random seed used for the random number engine. " "seed",
"0 means use a seed generated by the system." "Random seed used for the random number engine. "
"Note that if seed is not 0, this operator will always " "0 means use a seed generated by the system."
"generate the same random numbers every time. [default 0].") "Note that if seed is not 0, this operator will always "
"generate the same random numbers every time. (int, default 0).")
.SetDefault(0); .SetDefault(0);
} }
}; };
......
...@@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference { ...@@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = op_desc.Output("Out").front();
auto *out_var = block->FindVarRecursive(out_var_name); auto *out_var = block->FindVarRecursive(out_var_name);
out_var->SetType(in_var.GetType()); if (in_var_name != out_var_name) {
out_var->SetDataType(in_var.GetDataType()); out_var->SetType(in_var.GetType());
out_var->SetDataType(in_var.GetDataType());
}
} }
}; };
......
...@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> { ...@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
} }
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_LT(0, offset_data[i], PADDLE_ENFORCE_LE(0, offset_data[i],
"The offset[%d] must greater than zero.", i); "The offset[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(0, length_data[i], PADDLE_ENFORCE_LT(0, length_data[i],
"The length[%d] must greater than zero.", i); "The length[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
lod[0][i + 1], "The target tensor's length overflow."); lod[0][i + 1], "The target tensor's length overflow.");
} }
......
...@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value(); auto& in_value = grad->value();
framework::Vector<int64_t> in_rows(grad->rows()); auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size(); int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
......
...@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
auto in_vars = context.MultiInputVar("X"); auto in_vars = context.MultiInputVar("X");
int N = in_vars.size(); size_t in_num = in_vars.size();
auto out_var = context.OutputVar("Out"); auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0]; bool in_place = out_var == in_vars[0];
...@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> {
auto &place = auto &place =
*context.template device_context<DeviceContext>().eigen_device(); *context.template device_context<DeviceContext>().eigen_device();
// If in_place, just skip the first tensor // If in_place, just skip the first tensor
for (int i = in_place ? 1 : 0; i < N; i++) { for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) { if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>(); auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) { if (in_t.numel() == 0) {
...@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> {
// Runtime InferShape // Runtime InferShape
size_t first_dim = 0; size_t first_dim = 0;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size(); first_dim += sel_row.rows().size();
} }
std::vector<int64_t> in_dim; std::vector<int64_t> in_dim;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() > 0) { if (sel_row.rows().size() > 0) {
in_dim = framework::vectorize(sel_row.value().dims()); in_dim = framework::vectorize(sel_row.value().dims());
...@@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel<T> {
} }
if (in_dim.empty()) { if (in_dim.empty()) {
VLOG(3) << "WARNING: all the inputs are empty"; VLOG(3) << "WARNING: all the inputs are empty";
in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); in_dim =
framework::vectorize(get_selected_row(in_num - 1).value().dims());
} else { } else {
in_dim[0] = static_cast<int64_t>(first_dim); in_dim[0] = static_cast<int64_t>(first_dim);
} }
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {
...@@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel<T> {
math::SelectedRowsAddTo<DeviceContext, T> functor; math::SelectedRowsAddTo<DeviceContext, T> functor;
int64_t offset = 0; int64_t offset = 0;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() == 0) { if (sel_row.rows().size() == 0) {
continue; continue;
......
...@@ -22,8 +22,6 @@ ...@@ -22,8 +22,6 @@
namespace paddle { namespace paddle {
DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");
namespace operators { namespace operators {
...@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Ys", "A list of outputs").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph."); AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine."); AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
AddAttr<int>("max_batch_size", "the maximum batch size.");
AddAttr<int>("workspace_size", "the workspace size.");
AddComment("TensorRT engine operator."); AddComment("TensorRT engine operator.");
} }
}; };
......
...@@ -28,8 +28,6 @@ ...@@ -28,8 +28,6 @@
namespace paddle { namespace paddle {
DECLARE_int32(tensorrt_engine_batch_size); DECLARE_int32(tensorrt_engine_batch_size);
DECLARE_int32(tensorrt_max_batch_size);
DECLARE_int32(tensorrt_workspace_size);
namespace operators { namespace operators {
...@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto engine_name = context.Attr<std::string>("engine_uniq_key"); auto engine_name = context.Attr<std::string>("engine_uniq_key");
int max_batch_size = context.Attr<int>("max_batch_size");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) { if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context); Prepare(context);
} }
auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name); auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
auto input_names = context.op().Inputs("Xs"); auto input_names = context.op().Inputs("Xs");
PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
FLAGS_tensorrt_max_batch_size);
std::vector<std::string> output_maps = std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping"); context.Attr<std::vector<std::string>>("output_name_mapping");
...@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Get the ProgramDesc and pass to convert. // Get the ProgramDesc and pass to convert.
framework::proto::BlockDesc block_desc; framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(context.Attr<std::string>("subgraph")); block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
int max_batch = FLAGS_tensorrt_max_batch_size; int max_batch_size = context.Attr<int>("max_batch_size");
auto max_workspace = FLAGS_tensorrt_workspace_size; int workspace_size = context.Attr<int>("workspace_size");
auto params = context.Attr<std::vector<std::string>>("parameters"); auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters; std::unordered_set<std::string> parameters;
for (const auto& param : params) { for (const auto& param : params) {
...@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// TODO(Superjomn) replace this with a different stream // TODO(Superjomn) replace this with a different stream
auto* engine = Singleton<TRT_EngineManager>::Global().Create( auto* engine = Singleton<TRT_EngineManager>::Global().Create(
max_batch, max_workspace, nullptr /*engine hold its own stream*/, max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
context.Attr<std::string>("engine_uniq_key"), context.Attr<std::string>("engine_uniq_key"),
boost::get<platform::CUDAPlace>(context.GetPlace()).device); boost::get<platform::CUDAPlace>(context.GetPlace()).device);
......
...@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, ...@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
using inference::analysis::SetAttr; using inference::analysis::SetAttr;
TEST(TensorRTEngineOp, manual) { TEST(TensorRTEngineOp, manual) {
FLAGS_tensorrt_engine_batch_size = 2;
FLAGS_tensorrt_max_batch_size = 2;
framework::ProgramDesc program; framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks(); auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0); block_->set_idx(0);
...@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) { ...@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); block_->SerializeAsString());
SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters", SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
std::vector<std::string>({})); std::vector<std::string>({}));
...@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) { ...@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) {
} }
void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
FLAGS_tensorrt_engine_batch_size = batch_size;
FLAGS_tensorrt_max_batch_size = batch_size;
framework::ProgramDesc program; framework::ProgramDesc program;
framework::Scope scope; framework::Scope scope;
platform::CUDAPlace place; platform::CUDAPlace place;
...@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); block_->SerializeAsString());
SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size); SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10); SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
SetAttr<std::vector<std::string>>( SetAttr<std::vector<std::string>>(
engine_op_desc.Proto(), "parameters", engine_op_desc.Proto(), "parameters",
std::vector<std::string>({"y0", "y1", "y2", "y3"})); std::vector<std::string>({"y0", "y1", "y2", "y3"}));
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#if defined(_WIN32) #if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max #define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif #endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -47,7 +48,7 @@ limitations under the License. */ ...@@ -47,7 +48,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h" #include "paddle/fluid/platform/dynload/curand.h"
#if !defined(__APPLE__) and !defined(_WIN32) #if !defined(__APPLE__) && !defined(_WIN32)
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#endif // __APPLE__ #endif // __APPLE__
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -216,7 +217,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -216,7 +217,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
#endif #endif
} }
#if !defined(__APPLE__) and !defined(_WIN32) #if !defined(__APPLE__) && !defined(_WIN32)
template <typename... Args> template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
ncclResult_t stat, const Args&... args) { ncclResult_t stat, const Args&... args) {
...@@ -260,14 +261,8 @@ inline void throw_on_error(T e) { ...@@ -260,14 +261,8 @@ inline void throw_on_error(T e) {
} \ } \
} while (false) } while (false)
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
} while (false)
#else #else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__) #define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif // REPLACE_ENFORCE_GLOG #endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32 #else // !_WIN32
...@@ -281,6 +276,12 @@ inline void throw_on_error(T e) { ...@@ -281,6 +276,12 @@ inline void throw_on_error(T e) {
#define PADDLE_ENFORCE(x, ...) x #define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32 #endif // !_WIN32
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
} while (false)
/* /*
* Some enforce helpers here, usage: * Some enforce helpers here, usage:
* int a = 1; * int a = 1;
...@@ -294,7 +295,7 @@ inline void throw_on_error(T e) { ...@@ -294,7 +295,7 @@ inline void throw_on_error(T e) {
* extra messages is also supported, for example: * extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
*/ */
#if !defined(_WIN32)
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ #define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
...@@ -307,6 +308,7 @@ inline void throw_on_error(T e) { ...@@ -307,6 +308,7 @@ inline void throw_on_error(T e) {
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \ do { \
if (UNLIKELY(nullptr == (__VAL))) { \ if (UNLIKELY(nullptr == (__VAL))) { \
...@@ -326,6 +328,27 @@ inline void throw_on_error(T e) { ...@@ -326,6 +328,27 @@ inline void throw_on_error(T e) {
paddle::string::Sprintf("" __VA_ARGS__)); \ paddle::string::Sprintf("" __VA_ARGS__)); \
} \ } \
} while (0) } while (0)
#else
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
if (!((__VAL0)__CMP(__VAL1))) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed."); \
} \
} while (0)
#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \
do { \
if (nullptr == (__VAL1)) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
} \
} while (0)
#endif // !_WIN32
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -285,12 +285,12 @@ void BindOpDesc(pybind11::module *m) { ...@@ -285,12 +285,12 @@ void BindOpDesc(pybind11::module *m) {
.def("set_output", &pd::OpDesc::SetOutput) .def("set_output", &pd::OpDesc::SetOutput)
.def("input_arg_names", &pd::OpDesc::InputArgumentNames) .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
.def("output_arg_names", &pd::OpDesc::OutputArgumentNames) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
.def("rename_input", &pd::OpDesc::RenameInput) .def("_rename_input", &pd::OpDesc::RenameInput)
.def("rename_output", &pd::OpDesc::RenameOutput) .def("_rename_output", &pd::OpDesc::RenameOutput)
.def("has_attr", &pd::OpDesc::HasAttr) .def("has_attr", &pd::OpDesc::HasAttr)
.def("attr_type", &pd::OpDesc::GetAttrType) .def("attr_type", &pd::OpDesc::GetAttrType)
.def("attr_names", &pd::OpDesc::AttrNames) .def("attr_names", &pd::OpDesc::AttrNames)
.def("set_attr", &pd::OpDesc::SetAttr) .def("_set_attr", &pd::OpDesc::SetAttr)
.def("attr", &pd::OpDesc::GetAttr) .def("attr", &pd::OpDesc::GetAttr)
.def("set_block_attr", &pd::OpDesc::SetBlockAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
.def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
...@@ -300,8 +300,8 @@ void BindOpDesc(pybind11::module *m) { ...@@ -300,8 +300,8 @@ void BindOpDesc(pybind11::module *m) {
std::string ser(seriralized); std::string ser(seriralized);
self.SetAttr(name, ser); self.SetAttr(name, ser);
}) })
.def("block_attr_id", &pd::OpDesc::GetBlockAttrId) .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId)
.def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
.def("check_attrs", &pd::OpDesc::CheckAttrs) .def("check_attrs", &pd::OpDesc::CheckAttrs)
.def("infer_shape", &pd::OpDesc::InferShape) .def("infer_shape", &pd::OpDesc::InferShape)
.def("infer_var_type", &pd::OpDesc::InferVarType) .def("infer_var_type", &pd::OpDesc::InferVarType)
......
function(train_test TARGET_NAME)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(train_test_ARGS)
foreach(arg ${train_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(test_train_${TARGET_NAME}${arg}
SRCS test_train_${TARGET_NAME}.cc
DEPS paddle_fluid_origin
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
endforeach()
endfunction(train_test)
if(WITH_TESTING)
train_test(recognize_digits ARGS mlp conv)
endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include <fstream>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
DEFINE_string(dirname, "", "Directory of the train model.");
namespace paddle {
void Train() {
CHECK(!FLAGS_dirname.empty());
framework::InitDevices(false);
const auto cpu_place = platform::CPUPlace();
framework::Executor executor(cpu_place);
framework::Scope scope;
auto train_program = inference::Load(
&executor, &scope, FLAGS_dirname + "__model_combined__.main_program",
FLAGS_dirname + "__params_combined__");
std::string loss_name = "";
for (auto op_desc : train_program->Block(0).AllOps()) {
if (op_desc->Type() == "mean") {
loss_name = op_desc->Output("Out")[0];
break;
}
}
PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
// prepare data
auto x_var = scope.Var("img");
auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
x_tensor->Resize({64, 1, 28, 28});
auto x_data = x_tensor->mutable_data<float>(cpu_place);
for (int i = 0; i < 64 * 28 * 28; ++i) {
x_data[i] = 1.0;
}
auto y_var = scope.Var("label");
auto y_tensor = y_var->GetMutable<framework::LoDTensor>();
y_tensor->Resize({64, 1});
auto y_data = y_tensor->mutable_data<int64_t>(cpu_place);
for (int i = 0; i < 64 * 1; ++i) {
y_data[i] = static_cast<int64_t>(1);
}
auto loss_var = scope.Var(loss_name);
float first_loss = 0.0;
float last_loss = 0.0;
for (int i = 0; i < 100; ++i) {
executor.Run(*train_program.get(), &scope, 0, false, true);
if (i == 0) {
first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
} else if (i == 99) {
last_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
}
}
EXPECT_LT(last_loss, first_loss);
}
TEST(train, recognize_digits) { Train(); }
} // namespace paddle
...@@ -70,8 +70,8 @@ function cmake_gen() { ...@@ -70,8 +70,8 @@ function cmake_gen() {
PYTHON_FLAGS="" PYTHON_FLAGS=""
SYSTEM=`uname -s` SYSTEM=`uname -s`
if [ "$SYSTEM" == "Darwin" ]; then if [ "$SYSTEM" == "Darwin" ]; then
echo "Using python abi: $1"
if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
echo "using python abi: $1"
if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
...@@ -82,7 +82,18 @@ function cmake_gen() { ...@@ -82,7 +82,18 @@ function cmake_gen() {
else else
exit 1 exit 1
fi fi
# TODO: qiyang add python3 part here elif [ "$1" == "cp35-cp35m" ]; then
if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
else
exit 1
fi
fi fi
else else
if [ "$1" != "" ]; then if [ "$1" != "" ]; then
...@@ -381,7 +392,7 @@ function run_mac_test() { ...@@ -381,7 +392,7 @@ function run_mac_test() {
EOF EOF
# TODO: jiabin need to refine this part when these tests fixed on mac # TODO: jiabin need to refine this part when these tests fixed on mac
ctest --output-on-failure -j8 ctest --output-on-failure -j $1
# make install should also be test when unittest # make install should also be test when unittest
make install -j 8 make install -j 8
pip install /usr/local/opt/paddle/share/wheels/*.whl pip install /usr/local/opt/paddle/share/wheels/*.whl
...@@ -629,10 +640,10 @@ EOF ...@@ -629,10 +640,10 @@ EOF
function gen_capi_package() { function gen_capi_package() {
if [[ ${WITH_C_API} == "ON" ]]; then if [[ ${WITH_C_API} == "ON" ]]; then
install_prefix="${PADDLE_ROOT}/build/capi_output" capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output
rm -rf $install_prefix rm -rf $capi_install_prefix
make DESTDIR="$install_prefix" install make DESTDIR="$capi_install_prefix" install
cd $install_prefix/usr/local cd $capi_install_prefix/
ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
fi fi
} }
...@@ -729,7 +740,7 @@ function main() { ...@@ -729,7 +740,7 @@ function main() {
maccheck) maccheck)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
build_mac build_mac
run_mac_test run_mac_test ${PROC_RUN:-1}
;; ;;
cicheck_py35) cicheck_py35)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
......
...@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None): ...@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None):
retry_limit = 3 retry_limit = 3
while not (os.path.exists(filename) and md5file(filename) == md5sum): while not (os.path.exists(filename) and md5file(filename) == md5sum):
if os.path.exists(filename): if os.path.exists(filename):
print("file md5", md5file(filename), md5sum) sys.stderr.write("file %s md5 %s" % (md5file(filename), md5sum))
if retry < retry_limit: if retry < retry_limit:
retry += 1 retry += 1
else: else:
raise RuntimeError("Cannot download {0} within retry limit {1}". raise RuntimeError("Cannot download {0} within retry limit {1}".
format(url, retry_limit)) format(url, retry_limit))
print("Cache file %s not found, downloading %s" % (filename, url)) sys.stderr.write("Cache file %s not found, downloading %s" %
(filename, url))
r = requests.get(url, stream=True) r = requests.get(url, stream=True)
total_length = r.headers.get('content-length') total_length = r.headers.get('content-length')
...@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None): ...@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None):
dl += len(data) dl += len(data)
f.write(data) f.write(data)
done = int(50 * dl / total_length) done = int(50 * dl / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done, sys.stderr.write("\r[%s%s]" % ('=' * done,
' ' * (50 - done))) ' ' * (50 - done)))
sys.stdout.flush() sys.stdout.flush()
sys.stderr.write("\n")
sys.stdout.flush()
return filename return filename
......
...@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size): ...@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size):
] ]
for name in names: for name in names:
for line in f.extractfile(name): for line in f.extractfile(name):
line_split = line.strip().split(six.b('\t')) line = cpt.to_text(line)
line_split = line.strip().split('\t')
if len(line_split) != 2: if len(line_split) != 2:
continue continue
src_seq = line_split[0] # one source sequence src_seq = line_split[0] # one source sequence
......
...@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int) word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f: with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"): for line in f.extractfile("wmt16/train"):
line_split = line.strip().split(six.b("\t")) line = cpt.to_text(line)
line_split = line.strip().split("\t")
if len(line_split) != 2: continue if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1] sen = line_split[0] if lang == "en" else line_split[1]
for w in sen.split(): for w in sen.split():
...@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): ...@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
with tarfile.open(tar_file, mode="r") as f: with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name): for line in f.extractfile(file_name):
line_split = line.strip().split(six.b("\t")) line = cpt.to_text(line)
line_split = line.strip().split("\t")
if len(line_split) != 2: if len(line_split) != 2:
continue continue
src_words = line_split[src_col].split() src_words = line_split[src_col].split()
......
...@@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): ...@@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
op_desc = op_descs[i] op_desc = op_descs[i]
if isinstance(op_desc, tuple): if isinstance(op_desc, tuple):
op_desc = op_desc[0] op_desc = op_desc[0]
op_desc.rename_input(old_name, new_name) op_desc._rename_input(old_name, new_name)
op_desc.rename_output(old_name, new_name) op_desc._rename_output(old_name, new_name)
def _create_op_desc_(op_type, inputs, outputs, attrs): def _create_op_desc_(op_type, inputs, outputs, attrs):
...@@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): ...@@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
if isinstance(val, framework.Block): if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc) op_desc.set_block_attr(name, val.desc)
else: else:
op_desc.set_attr(name, val) op_desc._set_attr(name, val)
return op_desc return op_desc
...@@ -346,7 +346,7 @@ def _append_backward_ops_(block, ...@@ -346,7 +346,7 @@ def _append_backward_ops_(block,
grad_sub_block_list = [] grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first # If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"): if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr_id("sub_block")) sub_block = program.block(op._block_attr_id("sub_block"))
grad_sub_block = program._create_block() grad_sub_block = program._create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx) grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op) cb = _callback_lookup_(op)
...@@ -382,7 +382,7 @@ def _append_backward_ops_(block, ...@@ -382,7 +382,7 @@ def _append_backward_ops_(block,
for op_desc in grad_op_descs: for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op() new_op_desc = target_block.desc.append_op()
new_op_desc.copy_from(op_desc) new_op_desc.copy_from(op_desc)
new_op_desc.set_attr(op_role_attr_name, backward) new_op_desc._set_attr(op_role_attr_name, backward)
grad_to_var["__current_op_desc__"] = new_op_desc grad_to_var["__current_op_desc__"] = new_op_desc
if callbacks is not None: if callbacks is not None:
assert (isinstance(callbacks, list)) assert (isinstance(callbacks, list))
...@@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): ...@@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for op_idx in range(start_op_idx, block.desc.op_size()): for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx) op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"): if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr_id("sub_block")) sub_block = block.program.block(op_desc._block_attr_id("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set() new_vars = set()
# create new gradient variables # create new gradient variables
...@@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): ...@@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
op_desc = block.desc.op(op_idx) op_desc = block.desc.op(op_idx)
for name in op_desc.input_arg_names(): for name in op_desc.input_arg_names():
if name in var_map: if name in var_map:
op_desc.rename_input(name, var_map[name]) op_desc._rename_input(name, var_map[name])
for name in op_desc.output_arg_names(): for name in op_desc.output_arg_names():
if block.desc.find_var(name.encode("ascii")): if block.desc.find_var(name.encode("ascii")):
new_name = unique_name.generate(name) new_name = unique_name.generate(name)
op_desc.rename_output(name, new_name) op_desc._rename_output(name, new_name)
var_map[name] = new_name var_map[name] = new_name
for g, ng in six.iteritems(var_map): for g, ng in six.iteritems(var_map):
...@@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
if loss.op is None: if loss.op is None:
raise ValueError("loss.op is None. Should not happend") raise ValueError("loss.op is None. Should not happend")
loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(), loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
int(core.op_proto_and_checker_maker.OpRole.Forward) | int(core.op_proto_and_checker_maker.OpRole.Forward) |
int(core.op_proto_and_checker_maker.OpRole.Loss)) int(core.op_proto_and_checker_maker.OpRole.Loss))
if callbacks is not None: if callbacks is not None:
isinstance(callbacks, list) isinstance(callbacks, list)
...@@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
attr_val = [p.name, g.name] attr_val = [p.name, g.name]
if g.op.has_attr(op_role_var_attr_name): if g.op.has_attr(op_role_var_attr_name):
attr_val.extend(g.op.attr(op_role_var_attr_name)) attr_val.extend(g.op.attr(op_role_var_attr_name))
g.op.set_attr(op_role_var_attr_name, attr_val) g.op._set_attr(op_role_var_attr_name, attr_val)
return params_and_grads return params_and_grads
......
...@@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr): ...@@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr):
clip_op_desc.set_type("clip") clip_op_desc.set_type("clip")
clip_op_desc.set_input("X", [grad_name]) clip_op_desc.set_input("X", [grad_name])
clip_op_desc.set_output("Out", [grad_name]) clip_op_desc.set_output("Out", [grad_name])
clip_op_desc.set_attr("min", self.min) clip_op_desc._set_attr("min", self.min)
clip_op_desc.set_attr("max", self.max) clip_op_desc._set_attr("max", self.max)
def error_clip_callback(block, context): def error_clip_callback(block, context):
......
...@@ -18,5 +18,10 @@ from . import decoder ...@@ -18,5 +18,10 @@ from . import decoder
from .decoder import * from .decoder import *
from . import memory_usage_calc from . import memory_usage_calc
from .memory_usage_calc import * from .memory_usage_calc import *
from . import op_frequence
from .op_frequence import *
__all__ = decoder.__all__ + memory_usage_calc.__all__ __all__ = []
__all__ += decoder.__all__
__all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from collections import OrderedDict
from ..framework import Program
__all__ = ['op_freq_statistic']
def op_freq_statistic(program):
"""
Statistics of Op frequency.
Args:
program(Program): The current Program.
Returns:
uni_op_freq(dict): the single op frequency.
adj_2_op_freq(dict): the two adjacent ops frequency.
Examples:
>>> import paddle.fluid as fluid
>>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic(
>>> fluid.default_main_program())
>>> for op_type, op_num in uni_op_freq:
>>> print("%s \t %d" % (op_type, op_num))
>>> for op_type, op_num in adj_2_op_freq:
>>> print("%s \t %d" % (op_type, op_num))
"""
if not isinstance(program, Program):
raise TypeError("The input type should be Porgram."
"But you passed in %s" % (type(program)))
uni_op_freq = OrderedDict()
adj_2_op_freq = OrderedDict()
op_in_ops = OrderedDict()
parameters = [p.name for p in program.blocks[0].all_parameters()]
# get uni_op_freq
for op in program.global_block().ops:
had_recorded = False
for var_name in op.output_arg_names:
if var_name in parameters:
continue
if not had_recorded and uni_op_freq.has_key(op.type):
uni_op_freq[op.type] += 1
had_recorded = True
elif not had_recorded:
uni_op_freq[op.type] = 1
had_recorded = True
# get adj_2_op_freq
var_gen_op = {}
for op in program.global_block().ops:
for var_name in op.input_arg_names:
if var_name in parameters:
continue
if var_gen_op.has_key(var_name):
assert len(var_gen_op[var_name]) > 0
if op_in_ops.has_key(op.type):
op_in_ops[op.type].append(var_gen_op[var_name][-1])
else:
op_in_ops[op.type] = [var_gen_op[var_name][-1]]
else:
print("Var's generate op is not found,%s, %s" %
(var_name, op.type))
for var_name in op.output_arg_names:
if var_gen_op.has_key(var_name):
var_gen_op[var_name].append(op.type)
else:
var_gen_op[var_name] = [op.type]
for op, in_ops in op_in_ops.iteritems():
for in_op in in_ops:
op_op = in_op + "->" + op
if adj_2_op_freq.has_key(op_op):
adj_2_op_freq[op_op] += 1
else:
adj_2_op_freq[op_op] = 1
uni_op_freq = sorted(
uni_op_freq.items(), key=lambda item: item[1], reverse=True)
adj_2_op_freq = sorted(
adj_2_op_freq.items(), key=lambda item: item[1], reverse=True)
return uni_op_freq, adj_2_op_freq
...@@ -40,11 +40,9 @@ PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None ...@@ -40,11 +40,9 @@ PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None
__all__ = [ __all__ = [
'Program', 'Program',
'Operator',
'default_startup_program', 'default_startup_program',
'default_main_program', 'default_main_program',
'program_guard', 'program_guard',
'get_var',
'name_scope', 'name_scope',
] ]
...@@ -663,11 +661,11 @@ class Operator(object): ...@@ -663,11 +661,11 @@ class Operator(object):
self._update_desc_attr(attr_name, attr_val) self._update_desc_attr(attr_name, attr_val)
self.desc.check_attrs() self.desc.check_attrs()
if self.has_kernel(type): if self._has_kernel(type):
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc) self.desc.infer_shape(self.block.desc)
def has_kernel(self, op_type): def _has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET return op_type not in self.OP_WITHOUT_KERNEL_SET
def to_string(self, throw_on_error): def to_string(self, throw_on_error):
...@@ -708,7 +706,7 @@ class Operator(object): ...@@ -708,7 +706,7 @@ class Operator(object):
""" """
return self.desc.input(name) return self.desc.input(name)
def rename_input(self, old_name, new_name): def _rename_input(self, old_name, new_name):
""" """
Rename the `old_name` to `new_name`. Rename the `old_name` to `new_name`.
...@@ -719,9 +717,9 @@ class Operator(object): ...@@ -719,9 +717,9 @@ class Operator(object):
Returns: Returns:
None None
""" """
self.desc.rename_input(old_name, new_name) self.desc._rename_input(old_name, new_name)
def rename_output(self, old_name, new_name): def _rename_output(self, old_name, new_name):
""" """
Rename the `old_name` to `new_name`. Rename the `old_name` to `new_name`.
...@@ -732,7 +730,7 @@ class Operator(object): ...@@ -732,7 +730,7 @@ class Operator(object):
Returns: Returns:
None None
""" """
self.desc.rename_output(old_name, new_name) self.desc._rename_output(old_name, new_name)
@property @property
def input_names(self): def input_names(self):
...@@ -796,7 +794,7 @@ class Operator(object): ...@@ -796,7 +794,7 @@ class Operator(object):
""" """
return self.desc.attr_type(name) return self.desc.attr_type(name)
def set_attr(self, name, val): def _set_attr(self, name, val):
""" """
Set the value of attribute by attribute's name. Set the value of attribute by attribute's name.
...@@ -829,7 +827,7 @@ class Operator(object): ...@@ -829,7 +827,7 @@ class Operator(object):
isinstance(val, core.ProgramDesc): isinstance(val, core.ProgramDesc):
self.desc.set_serialized_attr(name, val.serialize_to_string()) self.desc.set_serialized_attr(name, val.serialize_to_string())
else: else:
self.desc.set_attr(name, val) self.desc._set_attr(name, val)
@property @property
def attr_names(self): def attr_names(self):
...@@ -848,7 +846,7 @@ class Operator(object): ...@@ -848,7 +846,7 @@ class Operator(object):
""" """
return self.desc.attr(name) return self.desc.attr(name)
def block_attr_id(self, name): def _block_attr_id(self, name):
""" """
Get the block attribute's id by name. Get the block attribute's id by name.
...@@ -858,9 +856,9 @@ class Operator(object): ...@@ -858,9 +856,9 @@ class Operator(object):
Returns: Returns:
int: the block index. int: the block index.
""" """
return self.desc.block_attr_id(name) return self.desc._block_attr_id(name)
def block_attr(self, name): def _block_attr(self, name):
""" """
Get the block attribute by name. Get the block attribute by name.
...@@ -871,11 +869,11 @@ class Operator(object): ...@@ -871,11 +869,11 @@ class Operator(object):
block: the block attribute. block: the block attribute.
""" """
id = self.block_attr_id(name) id = self._block_attr_id(name)
assert (id >= 0 and id < len(self.block.program.blocks)) assert (id >= 0 and id < len(self.block.program.blocks))
return self.block.program.blocks[id] return self.block.program.blocks[id]
def blocks_attr(self, name): def _blocks_attr(self, name):
""" """
Get the blocks attribute by name. Get the blocks attribute by name.
...@@ -886,13 +884,13 @@ class Operator(object): ...@@ -886,13 +884,13 @@ class Operator(object):
list: list of the blocks attribute. list: list of the blocks attribute.
""" """
attrs = [] attrs = []
for i in self.blocks_attr_ids(name): for i in self._blocks_attr_ids(name):
assert (i >= 0 and i < len(self.block.program.blocks)) assert (i >= 0 and i < len(self.block.program.blocks))
attrs.append(self.block.program.blocks[i]) attrs.append(self.block.program.blocks[i])
return attrs return attrs
def blocks_attr_ids(self, name): def _blocks_attr_ids(self, name):
""" """
Get the blocks attribute's ids by name. Get the blocks attribute's ids by name.
...@@ -903,7 +901,7 @@ class Operator(object): ...@@ -903,7 +901,7 @@ class Operator(object):
list: list of the blocks ids. list: list of the blocks ids.
""" """
return self.desc.blocks_attr_ids(name) return self.desc._blocks_attr_ids(name)
def all_attrs(self): def all_attrs(self):
""" """
...@@ -917,11 +915,11 @@ class Operator(object): ...@@ -917,11 +915,11 @@ class Operator(object):
for n in attr_names: for n in attr_names:
attr_type = self.desc.attr_type(n) attr_type = self.desc.attr_type(n)
if attr_type == core.AttrType.BLOCK: if attr_type == core.AttrType.BLOCK:
attr_map[n] = self.block_attr(n) attr_map[n] = self._block_attr(n)
continue continue
if attr_type == core.AttrType.BLOCKS: if attr_type == core.AttrType.BLOCKS:
attr_map[n] = self.blocks_attr(n) attr_map[n] = self._blocks_attr(n)
continue continue
attr_map[n] = self.attr(n) attr_map[n] = self.attr(n)
...@@ -1795,7 +1793,7 @@ class Program(object): ...@@ -1795,7 +1793,7 @@ class Program(object):
for j in six.moves.range(block.op_size()): for j in six.moves.range(block.op_size()):
op = block.op(j) op = block.op(j)
if op.has_attr('is_test'): if op.has_attr('is_test'):
op.set_attr('is_test', True) op._set_attr('is_test', True)
res.blocks = [ res.blocks = [
Block(res, i) for i in six.moves.range(res.desc.num_blocks()) Block(res, i) for i in six.moves.range(res.desc.num_blocks())
] ]
...@@ -2169,7 +2167,7 @@ def program_guard(main_program, startup_program=None): ...@@ -2169,7 +2167,7 @@ def program_guard(main_program, startup_program=None):
switch_startup_program(startup_program) switch_startup_program(startup_program)
def get_var(name, program=None): def _get_var(name, program=None):
""" """
Get a variable by name from the global block of a program. Get a variable by name from the global block of a program.
......
...@@ -600,7 +600,7 @@ def save_inference_model(dirname, ...@@ -600,7 +600,7 @@ def save_inference_model(dirname,
""" """
if isinstance(feeded_var_names, six.string_types): if isinstance(feeded_var_names, six.string_types):
feeded_var_names = [feeded_var_names] feeded_var_names = [feeded_var_names]
else: elif export_for_deployment:
if len(feeded_var_names) > 0: if len(feeded_var_names) > 0:
# TODO(paddle-dev): polish these code blocks # TODO(paddle-dev): polish these code blocks
if not (bool(feeded_var_names) and all( if not (bool(feeded_var_names) and all(
...@@ -610,61 +610,60 @@ def save_inference_model(dirname, ...@@ -610,61 +610,60 @@ def save_inference_model(dirname,
if isinstance(target_vars, Variable): if isinstance(target_vars, Variable):
target_vars = [target_vars] target_vars = [target_vars]
else: elif export_for_deployment:
if not (bool(target_vars) and all( if not (bool(target_vars) and all(
isinstance(var, Variable) for var in target_vars)): isinstance(var, Variable) for var in target_vars)):
raise ValueError("'target_vars' should be a list of Variable.") raise ValueError("'target_vars' should be a list of Variable.")
if main_program is None: if main_program is None:
main_program = default_main_program() main_program = default_main_program()
copy_program = main_program.clone()
# if there is lookup table, the trainer 0 will notify all pserver to save.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
os.makedirs(dirname) os.makedirs(dirname)
if model_filename is not None:
model_basename = os.path.basename(model_filename)
else:
model_basename = "__model__"
model_basename = os.path.join(dirname, model_basename)
# When export_for_deployment is true, we modify the program online so that # When export_for_deployment is true, we modify the program online so that
# it can only be loaded for inference directly. If it's false, the whole # it can only be loaded for inference directly. If it's false, the whole
# original program and related meta are saved so that future usage can be # original program and related meta are saved so that future usage can be
# more flexible. # more flexible.
if export_for_deployment: if export_for_deployment:
global_block = copy_program.global_block() main_program = main_program.clone()
global_block = main_program.global_block()
for i, op in enumerate(global_block.ops): for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False) op.desc.set_is_target(False)
if op.type == "feed" or op.type == "fetch": if op.type == "feed" or op.type == "fetch":
global_block._remove_op(i) global_block._remove_op(i)
copy_program.desc.flush() main_program.desc.flush()
pruned_program = copy_program._prune(targets=target_vars) main_program = main_program._prune(targets=target_vars)
saved_program = pruned_program._inference_optimize(prune_read_op=True) main_program = main_program._inference_optimize(prune_read_op=True)
fetch_var_names = [v.name for v in target_vars] fetch_var_names = [v.name for v in target_vars]
prepend_feed_ops(saved_program, feeded_var_names) prepend_feed_ops(main_program, feeded_var_names)
append_fetch_ops(saved_program, fetch_var_names) append_fetch_ops(main_program, fetch_var_names)
with open(model_basename, "wb") as f:
f.write(main_program.desc.serialize_to_string())
else: else:
# TODO(panyx0718): Save more information so that it can also be used # TODO(panyx0718): Save more information so that it can also be used
# for training and more flexible post-processing. # for training and more flexible post-processing.
saved_program = copy_program with open(model_basename + ".main_program", "wb") as f:
f.write(main_program.desc.serialize_to_string())
if model_filename is not None:
model_filename = os.path.basename(model_filename)
else:
model_filename = "__model__"
model_filename = os.path.join(dirname, model_filename)
if params_filename is not None: if params_filename is not None:
params_filename = os.path.basename(params_filename) params_filename = os.path.basename(params_filename)
save_persistables(executor, dirname, main_program, params_filename)
with open(model_filename, "wb") as f:
f.write(saved_program.desc.serialize_to_string())
save_persistables(executor, dirname, saved_program, params_filename)
# if there is lookup table, the trainer 0 will notify all pserver to save.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
def load_inference_model(dirname, def load_inference_model(dirname,
......
...@@ -284,7 +284,7 @@ def detection_output(loc, ...@@ -284,7 +284,7 @@ def detection_output(loc,
target_box=loc, target_box=loc,
code_type='decode_center_size') code_type='decode_center_size')
compile_shape = scores.shape compile_shape = scores.shape
run_shape = ops.shape(scores) run_shape = nn.shape(scores)
scores = nn.flatten(x=scores, axis=2) scores = nn.flatten(x=scores, axis=2)
scores = nn.softmax(input=scores) scores = nn.softmax(input=scores)
scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape) scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
...@@ -697,7 +697,7 @@ def ssd_loss(location, ...@@ -697,7 +697,7 @@ def ssd_loss(location,
raise ValueError("Only support mining_type == max_negative now.") raise ValueError("Only support mining_type == max_negative now.")
num, num_prior, num_class = confidence.shape num, num_prior, num_class = confidence.shape
conf_shape = ops.shape(confidence) conf_shape = nn.shape(confidence)
def __reshape_to_2d(var): def __reshape_to_2d(var):
return nn.flatten(x=var, axis=2) return nn.flatten(x=var, axis=2)
...@@ -724,7 +724,7 @@ def ssd_loss(location, ...@@ -724,7 +724,7 @@ def ssd_loss(location,
target_label.stop_gradient = True target_label.stop_gradient = True
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples # 3. Mining hard examples
actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2]) actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2])
actual_shape.stop_gradient = True actual_shape.stop_gradient = True
conf_loss = nn.reshape( conf_loss = nn.reshape(
x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape) x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
......
...@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None):
return acc_out return acc_out
def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): def auc(input,
label,
curve='ROC',
num_thresholds=2**12 - 1,
topk=1,
slide_steps=1):
""" """
**Area Under the Curve (AUC) Layer** **Area Under the Curve (AUC) Layer**
...@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): ...@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
num_thresholds(int): The number of thresholds to use when discretizing num_thresholds(int): The number of thresholds to use when discretizing
the roc curve. Default 200. the roc curve. Default 200.
topk(int): only topk number of prediction output will be used for auc. topk(int): only topk number of prediction output will be used for auc.
slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps.
Returns: Returns:
Variable: A scalar representing the current AUC. Variable: A scalar representing the current AUC.
...@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): ...@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
auc_out = helper.create_tmp_variable(dtype="float64") auc_out = helper.create_tmp_variable(dtype="float64")
batch_auc_out = helper.create_tmp_variable(dtype="float64") batch_auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches. # make tp, tn, fp, fn persistable, so that can accumulate all batches.
# for batch auc
batch_stat_pos = helper.create_global_variable(
persistable=True,
dtype='int64',
shape=[slide_steps, num_thresholds + 1])
batch_stat_neg = helper.create_global_variable(
persistable=True,
dtype='int64',
shape=[slide_steps, num_thresholds + 1])
# for global auc
stat_pos = helper.create_global_variable( stat_pos = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds + 1]) persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
stat_neg = helper.create_global_variable( stat_neg = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds + 1]) persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
for var in [stat_pos, stat_neg]: for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
helper.set_variable_initializer( helper.set_variable_initializer(
var, Constant( var, Constant(
value=0.0, force_cpu=True)) value=0.0, force_cpu=True))
# Batch AUC
helper.append_op(
type="auc",
inputs={
"Predict": [input],
"Label": [label],
"StatPos": [batch_stat_pos],
"StatNeg": [batch_stat_neg]
},
attrs={
"curve": curve,
"num_thresholds": num_thresholds,
"slide_steps": slide_steps
},
outputs={
"AUC": [batch_auc_out],
"StatPosOut": [batch_stat_pos],
"StatNegOut": [batch_stat_neg]
})
# Global AUC
helper.append_op( helper.append_op(
type="auc", type="auc",
inputs={ inputs={
...@@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): ...@@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
"StatPos": [stat_pos], "StatPos": [stat_pos],
"StatNeg": [stat_neg] "StatNeg": [stat_neg]
}, },
attrs={"curve": curve, attrs={
"num_thresholds": num_thresholds}, "curve": curve,
"num_thresholds": num_thresholds,
"slide_steps": 0
},
outputs={ outputs={
"AUC": [auc_out], "AUC": [auc_out],
"BatchAUC": [batch_auc_out],
"StatPosOut": [stat_pos], "StatPosOut": [stat_pos],
"StatNegOut": [stat_neg] "StatNegOut": [stat_neg]
}) })
return auc_out, batch_auc_out, [stat_pos, stat_neg] return auc_out, batch_auc_out, [
batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
]
...@@ -50,8 +50,10 @@ __all__ = [ ...@@ -50,8 +50,10 @@ __all__ = [
'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate', 'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate',
'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div', 'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div',
'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min', 'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min',
'elementwise_pow', 'logical_and', 'logical_or', 'logical_xor', 'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random',
'logical_not', 'clip', 'clip_by_norm' 'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape',
'logical_and', 'logical_or', 'logical_xor', 'logical_not', 'clip',
'clip_by_norm'
] ]
...@@ -6382,6 +6384,246 @@ def expand(x, expand_times, name=None): ...@@ -6382,6 +6384,246 @@ def expand(x, expand_times, name=None):
return out return out
from paddle.fluid.framework import convert_np_dtype_to_dtype_
@templatedoc()
def uniform_random_batch_size_like(input,
shape,
dtype='float32',
input_dim_idx=0,
output_dim_idx=0,
min=-1.0,
max=1.0,
seed=0):
"""
${comment}
Args:
input (Variable): ${input_comment}
shape (tuple|list): ${shape_comment}
input_dim_idx (Int): ${input_dim_idx_comment}
output_dim_idx (Int): ${output_dim_idx_comment}
min (Float): ${min_comment}
max (Float): ${max_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('uniform_random_batch_size_like', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='uniform_random_batch_size_like',
inputs={'Input': input},
outputs={'Out': out},
attrs={
'shape': shape,
'input_dim_idx': input_dim_idx,
'output_dim_idx': output_dim_idx,
'min': min,
'max': max,
'seed': seed,
'dtype': c_dtype
})
return out
@templatedoc()
def gaussian_random(shape,
mean=0.0,
std=1.0,
seed=0,
dtype='float32',
use_mkldnn=False):
"""
${comment}
Args:
shape (tuple|list): ${shape_comment}
mean (Float): ${mean_comment}
std (Float): ${std_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
use_mkldnn (Bool): Only used in mkldnn kernel.
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('gaussian_random', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='gaussian_random',
outputs={'Out': out},
attrs={
'shape': shape,
'mean': mean,
'std': std,
'seed': seed,
'dtype': c_dtype,
'use_mkldnn': use_mkldnn
})
return out
@templatedoc()
def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
"""
${comment}
Args:
x (Variable): ${x_comment}
min (Float): ${min_comment}
max (Float): ${max_comment}
seed (Float): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('sampling_id', **locals())
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='sampling_id',
inputs={'X': x},
outputs={'Out': out},
attrs={'min': min,
'max': max,
'seed': seed})
return out
@templatedoc()
def gaussian_random_batch_size_like(input,
shape,
input_dim_idx=0,
output_dim_idx=0,
mean=0.0,
std=1.0,
seed=0,
dtype='float32'):
"""
${comment}
Args:
input (Variable): ${input_comment}
shape (tuple|list): ${shape_comment}
input_dim_idx (Int): ${input_dim_idx_comment}
output_dim_idx (Int): ${output_dim_idx_comment}
mean (Float): ${mean_comment}
std (Float): ${std_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('gaussian_random_batch_size_like', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='gaussian_random_batch_size_like',
inputs={'Input': input},
outputs={'Out': out},
attrs={
'shape': shape,
'input_dim_idx': input_dim_idx,
'output_dim_idx': output_dim_idx,
'mean': mean,
'std': std,
'seed': seed,
'dtype': c_dtype
})
return out
@templatedoc()
def sum(x, use_mkldnn=False):
"""
${comment}
Args:
x (Variable): ${x_comment}
use_mkldnn (Bool): ${use_mkldnn_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('sum', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('x'))
helper.append_op(
type='sum',
inputs={'X': x},
outputs={'Out': out},
attrs={'use_mkldnn': use_mkldnn})
return out
@templatedoc()
def slice(input, axes, starts, ends):
"""
${comment}
Args:
input (Variable): ${input_comment}.
axes (List): ${axes_comment}
starts (List): ${starts_comment}
ends (List): ${ends_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('slice', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
helper.append_op(
type='slice',
inputs={'Input': input},
outputs={'Out': out},
attrs={'axes': axes,
'starts': starts,
'ends': ends})
return out
@templatedoc()
def shape(input):
"""
${comment}
Args:
input (Variable): ${input_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('shape', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
helper.append_op(
type='shape', inputs={'Input': input}, outputs={'Out': out})
return out
def _elementwise_op(helper): def _elementwise_op(helper):
op_type = helper.layer_type op_type = helper.layer_type
x = helper.kwargs.get('x', None) x = helper.kwargs.get('x', None)
......
...@@ -39,13 +39,6 @@ __all__ = [ ...@@ -39,13 +39,6 @@ __all__ = [
'mean', 'mean',
'mul', 'mul',
'sigmoid_cross_entropy_with_logits', 'sigmoid_cross_entropy_with_logits',
'uniform_random_batch_size_like',
'gaussian_random',
'sampling_id',
'gaussian_random_batch_size_like',
'sum',
'slice',
'shape',
'maxout', 'maxout',
] ]
...@@ -57,6 +50,8 @@ for _OP in set(__all__): ...@@ -57,6 +50,8 @@ for _OP in set(__all__):
# e.g.: test_program_code.py, test_dist_train.py # e.g.: test_program_code.py, test_dist_train.py
globals()['_scale'] = generate_layer_fn('scale') globals()['_scale'] = generate_layer_fn('scale')
globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
__all__ += __activations_noattr__ __all__ += __activations_noattr__
for _OP in set(__activations_noattr__): for _OP in set(__activations_noattr__):
......
...@@ -26,6 +26,7 @@ from .layer_helper import LayerHelper ...@@ -26,6 +26,7 @@ from .layer_helper import LayerHelper
from .regularizer import append_regularization_ops from .regularizer import append_regularization_ops
from .clip import append_gradient_clip_ops, error_clip_callback from .clip import append_gradient_clip_ops, error_clip_callback
from contextlib import contextmanager from contextlib import contextmanager
from .layers import ops
__all__ = [ __all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
...@@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer): ...@@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer):
x=tmp, dtype='float32' if self._dtype == None else self._dtype) x=tmp, dtype='float32' if self._dtype == None else self._dtype)
sum = layers.cast( sum = layers.cast(
x=sum, dtype='float32' if self._dtype == None else self._dtype) x=sum, dtype='float32' if self._dtype == None else self._dtype)
layers.elementwise_div(x=sum, y=tmp, out=param) ops._elementwise_div(x=sum, y=tmp, out=param)
def _add_average_restore_op(self, block, param_grad): def _add_average_restore_op(self, block, param_grad):
param = block._clone_variable(param_grad[0]) param = block._clone_variable(param_grad[0])
......
...@@ -67,6 +67,7 @@ def train(nn_type, ...@@ -67,6 +67,7 @@ def train(nn_type,
use_cuda, use_cuda,
parallel, parallel,
save_dirname=None, save_dirname=None,
save_full_dirname=None,
model_filename=None, model_filename=None,
params_filename=None, params_filename=None,
is_local=True): is_local=True):
...@@ -143,6 +144,13 @@ def train(nn_type, ...@@ -143,6 +144,13 @@ def train(nn_type,
exe, exe,
model_filename=model_filename, model_filename=model_filename,
params_filename=params_filename) params_filename=params_filename)
if save_full_dirname is not None:
fluid.io.save_inference_model(
save_full_dirname, [], [],
exe,
model_filename=model_filename,
params_filename=params_filename,
export_for_deployment=False)
return return
else: else:
print( print(
...@@ -214,10 +222,12 @@ def infer(use_cuda, ...@@ -214,10 +222,12 @@ def infer(use_cuda,
def main(use_cuda, parallel, nn_type, combine): def main(use_cuda, parallel, nn_type, combine):
save_dirname = None save_dirname = None
save_full_dirname = None
model_filename = None model_filename = None
params_filename = None params_filename = None
if not use_cuda and not parallel: if not use_cuda and not parallel:
save_dirname = "recognize_digits_" + nn_type + ".inference.model" save_dirname = "recognize_digits_" + nn_type + ".inference.model"
save_full_dirname = "recognize_digits_" + nn_type + ".train.model"
if combine == True: if combine == True:
model_filename = "__model_combined__" model_filename = "__model_combined__"
params_filename = "__params_combined__" params_filename = "__params_combined__"
...@@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine): ...@@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine):
use_cuda=use_cuda, use_cuda=use_cuda,
parallel=parallel, parallel=parallel,
save_dirname=save_dirname, save_dirname=save_dirname,
save_full_dirname=save_full_dirname,
model_filename=model_filename, model_filename=model_filename,
params_filename=params_filename) params_filename=params_filename)
infer( infer(
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import dist_ctr_reader
from test_dist_base import TestDistRunnerBase, runtime_main
IS_SPARSE = True
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
class TestDistCTR2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
""" network definition """
dnn_data = fluid.layers.data(
name="dnn_data",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
lr_data = fluid.layers.data(
name="lr_data",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
label = fluid.layers.data(
name="click",
shape=[-1, 1],
dtype="int64",
lod_level=0,
append_batch_size=False)
# build dnn model
dnn_layer_dims = [128, 64, 32, 1]
dnn_embedding = fluid.layers.embedding(
is_distributed=False,
input=dnn_data,
size=[dnn_input_dim, dnn_layer_dims[0]],
param_attr=fluid.ParamAttr(
name="deep_embedding",
initializer=fluid.initializer.Constant(value=0.01)),
is_sparse=IS_SPARSE)
dnn_pool = fluid.layers.sequence_pool(
input=dnn_embedding, pool_type="sum")
dnn_out = dnn_pool
for i, dim in enumerate(dnn_layer_dims[1:]):
fc = fluid.layers.fc(
input=dnn_out,
size=dim,
act="relu",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)),
name='dnn-fc-%d' % i)
dnn_out = fc
# build lr model
lr_embbding = fluid.layers.embedding(
is_distributed=False,
input=lr_data,
size=[lr_input_dim, 1],
param_attr=fluid.ParamAttr(
name="wide_embedding",
initializer=fluid.initializer.Constant(value=0.01)),
is_sparse=IS_SPARSE)
lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
acc = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
label=label)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
inference_program = paddle.fluid.default_main_program().clone()
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
sgd_optimizer.minimize(avg_cost)
dataset = dist_ctr_reader.Dataset()
train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
test_reader = paddle.batch(dataset.test(), batch_size=batch_size)
return inference_program, avg_cost, train_reader, test_reader, None, predict
if __name__ == "__main__":
runtime_main(TestDistCTR2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import paddle
import tarfile
logging.basicConfig()
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz"
DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
"""
avazu_ctr_data/train.txt
avazu_ctr_data/infer.txt
avazu_ctr_data/test.txt
avazu_ctr_data/data.meta.txt
"""
def read_data(file_name):
path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5)
tar = tarfile.open(path, "r:gz")
tar_info = None
for member in tar.getmembers():
if member.name.endswith(file_name):
tar_info = member
f = tar.extractfile(tar_info)
ret_lines = [_.decode('utf-8') for _ in f.readlines()]
return ret_lines
class TaskMode:
TRAIN_MODE = 0
TEST_MODE = 1
INFER_MODE = 2
def __init__(self, mode):
self.mode = mode
def is_train(self):
return self.mode == self.TRAIN_MODE
def is_test(self):
return self.mode == self.TEST_MODE
def is_infer(self):
return self.mode == self.INFER_MODE
@staticmethod
def create_train():
return TaskMode(TaskMode.TRAIN_MODE)
@staticmethod
def create_test():
return TaskMode(TaskMode.TEST_MODE)
@staticmethod
def create_infer():
return TaskMode(TaskMode.INFER_MODE)
class ModelType:
CLASSIFICATION = 0
REGRESSION = 1
def __init__(self, mode):
self.mode = mode
def is_classification(self):
return self.mode == self.CLASSIFICATION
def is_regression(self):
return self.mode == self.REGRESSION
@staticmethod
def create_classification():
return ModelType(ModelType.CLASSIFICATION)
@staticmethod
def create_regression():
return ModelType(ModelType.REGRESSION)
def load_dnn_input_record(sent):
return list(map(int, sent.split()))
def load_lr_input_record(sent):
res = []
for _ in [x.split(':') for x in sent.split()]:
res.append(int(_[0]))
return res
feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2}
class Dataset(object):
def train(self):
'''
Load trainset.
'''
file_name = "train.txt"
logger.info("load trainset from %s" % file_name)
mode = TaskMode.create_train()
return self._parse_creator(file_name, mode)
def test(self):
'''
Load testset.
'''
file_name = "test.txt"
logger.info("load testset from %s" % file_name)
mode = TaskMode.create_test()
return self._parse_creator(file_name, mode)
def infer(self):
'''
Load infer set.
'''
file_name = "infer.txt"
logger.info("load inferset from %s" % file_name)
mode = TaskMode.create_infer()
return self._parse_creator(file_name, mode)
def _parse_creator(self, file_name, mode):
'''
Parse dataset.
'''
def _parse():
data = read_data(file_name)
for line_id, line in enumerate(data):
fs = line.strip().split('\t')
dnn_input = load_dnn_input_record(fs[0])
lr_input = load_lr_input_record(fs[1])
if not mode.is_infer():
click = int(fs[2])
yield [dnn_input, lr_input, click]
else:
yield [dnn_input, lr_input]
return _parse
def load_data_meta():
'''
load data meta info from path, return (dnn_input_dim, lr_input_dim)
'''
lines = read_data('data.meta.txt')
err_info = "wrong meta format"
assert len(lines) == 2, err_info
assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
1], err_info
res = map(int, [_.split(':')[1] for _ in lines])
res = list(res)
logger.info('dnn input dim: %d' % res[0])
logger.info('lr input dim: %d' % res[1])
return res
...@@ -47,7 +47,7 @@ def cnn_model(data): ...@@ -47,7 +47,7 @@ def cnn_model(data):
pool_stride=2, pool_stride=2,
act="relu", act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.3))) value=0.01)))
conv_pool_2 = fluid.nets.simple_img_conv_pool( conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1, input=conv_pool_1,
filter_size=5, filter_size=5,
...@@ -56,7 +56,7 @@ def cnn_model(data): ...@@ -56,7 +56,7 @@ def cnn_model(data):
pool_stride=2, pool_stride=2,
act="relu", act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.2))) value=0.01)))
SIZE = 10 SIZE = 10
input_shape = conv_pool_2.shape input_shape = conv_pool_2.shape
...@@ -68,7 +68,7 @@ def cnn_model(data): ...@@ -68,7 +68,7 @@ def cnn_model(data):
size=SIZE, size=SIZE,
act="softmax", act="softmax",
param_attr=fluid.param_attr.ParamAttr( param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1))) initializer=fluid.initializer.Constant(value=0.01)))
return predict return predict
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
DTYPE = "int64"
DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000'
DATA_MD5 = '24e49366eb0611c552667989de2f57d5'
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
dict_dim = 1500
emb_dim = 128
hid_dim = 128
margin = 0.1
sample_rate = 1
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
def get_acc(cos_q_nt, cos_q_pt, batch_size):
cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
cond = fluid.layers.cast(cond, dtype='float64')
cond_3 = fluid.layers.reduce_sum(cond)
acc = fluid.layers.elementwise_div(
cond_3,
fluid.layers.fill_constant(
shape=[1], value=batch_size * 1.0, dtype='float64'),
name="simnet_acc")
return acc
def get_loss(cos_q_pt, cos_q_nt):
loss_op1 = fluid.layers.elementwise_sub(
fluid.layers.fill_constant_batch_size_like(
input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'),
cos_q_pt)
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max(
fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_op2)
avg_cost = fluid.layers.mean(loss_op3)
return avg_cost
def get_optimizer():
# SGD optimizer
optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
return optimizer
def train_network(batch_size, is_distributed=False, is_sparse=False):
# query
q = fluid.layers.data(
name="query_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
q_emb = fluid.layers.embedding(
input=q,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
## fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__q_fc__",
learning_rate=base_lr))
# label data
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
# pt
pt = fluid.layers.data(
name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
pt_emb = fluid.layers.embedding(
input=pt,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
## fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__fc__",
learning_rate=base_lr),
bias_attr=fluid.ParamAttr(name="__fc_b__"))
# nt
nt = fluid.layers.data(
name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
nt_emb = fluid.layers.embedding(
input=nt,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
## fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__fc__",
learning_rate=base_lr),
bias_attr=fluid.ParamAttr(name="__fc_b__"))
cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
# loss
avg_cost = get_loss(cos_q_pt, cos_q_nt)
# acc
acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
return [avg_cost, acc, cos_q_pt]
def combination(x, y):
res = [[[xi, yi] for yi in y] for xi in x]
return res[0]
def get_one_data(file_list):
for file in file_list:
contents = []
with open(file, "r") as fin:
for i in fin:
contents.append(i.strip())
for index, q in enumerate(contents):
try:
one_data = [[int(j) for j in i.split(" ")]
for i in q.split(";")[:-1]]
if one_data[1][0] + one_data[1][1] != len(one_data) - 3:
q = fin.readline()
continue
tmp = combination(one_data[3:3 + one_data[1][0]],
one_data[3 + one_data[1][0]:])
except Exception as e:
continue
for each in tmp:
yield [one_data[2], 0, each[0], each[1]]
def get_batch_reader(file_list, batch_size):
def batch_reader():
res = []
for i in get_one_data(file_list):
if random.random() <= sample_rate:
res.append(i)
if len(res) >= batch_size:
yield res
res = []
return batch_reader
def get_train_reader(batch_size):
# The training data set.
train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet",
"train")
train_reader = get_batch_reader([train_file], batch_size)
train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
return train_reader, train_feed
class TestDistSimnetBow2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
# Train program
avg_cost, acc, predict = \
train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])))
inference_program = fluid.default_main_program().clone()
# Optimization
opt = get_optimizer()
opt.minimize(avg_cost)
# Reader
train_reader, _ = get_train_reader(batch_size)
return inference_program, avg_cost, train_reader, train_reader, acc, predict
if __name__ == "__main__":
paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
runtime_main(TestDistSimnetBow2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
import six
import tarfile
import string
import re
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
DTYPE = "float32"
VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab'
VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2'
DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz'
DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a'
# Load dictionary.
def load_vocab(filename):
vocab = {}
if six.PY2:
with open(filename, 'r') as f:
for idx, line in enumerate(f):
vocab[line.strip()] = idx
else:
with open(filename, 'r', encoding="utf-8") as f:
for idx, line in enumerate(f):
vocab[line.strip()] = idx
return vocab
def get_worddict(dict_path):
word_dict = load_vocab(dict_path)
word_dict["<unk>"] = len(word_dict)
dict_dim = len(word_dict)
return word_dict, dict_dim
def conv_net(input,
dict_dim,
emb_dim=128,
window_size=3,
num_filters=128,
fc0_dim=96,
class_dim=2):
emb = fluid.layers.embedding(
input=input,
size=[dict_dim, emb_dim],
is_sparse=False,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
conv_3 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=num_filters,
filter_size=window_size,
act="tanh",
pool_type="max",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
fc_0 = fluid.layers.fc(
input=[conv_3],
size=fc0_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
prediction = fluid.layers.fc(
input=[fc_0],
size=class_dim,
act="softmax",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
return prediction
def inference_network(dict_dim):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
out = conv_net(data, dict_dim)
return out
def get_reader(word_dict, batch_size):
# The training data set.
train_reader = paddle.batch(train(word_dict), batch_size=batch_size)
# The testing data set.
test_reader = paddle.batch(test(word_dict), batch_size=batch_size)
return train_reader, test_reader
def get_optimizer(learning_rate):
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
return optimizer
class TestDistTextClassification2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
vocab = os.path.join(paddle.dataset.common.DATA_HOME,
"text_classification", "imdb.vocab")
word_dict, dict_dim = get_worddict(vocab)
# Input data
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = conv_net(data, dict_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=predict, label=label)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = get_optimizer(learning_rate=0.001)
opt.minimize(avg_cost)
# Reader
train_reader, test_reader = get_reader(word_dict, batch_size)
return inference_program, avg_cost, train_reader, test_reader, acc, predict
def tokenize(pattern):
"""
Read files that match the given pattern. Tokenize and yield each file.
"""
with tarfile.open(
paddle.dataset.common.download(DATA_URL, 'text_classification',
DATA_MD5)) as tarf:
# Note that we should use tarfile.next(), which does
# sequential access of member files, other than
# tarfile.extractfile, which does random access and might
# destroy hard disks.
tf = tarf.next()
while tf != None:
if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip(six.b(
"\n\r")).translate(
None, six.b(string.punctuation)).lower().split()
tf = tarf.next()
def reader_creator(pos_pattern, neg_pattern, word_idx):
UNK = word_idx['<unk>']
INS = []
def load(pattern, out, label):
for doc in tokenize(pattern):
out.append(([word_idx.get(w, UNK) for w in doc], label))
load(pos_pattern, INS, 0)
load(neg_pattern, INS, 1)
def reader():
for doc, label in INS:
yield doc, label
return reader
def train(word_idx):
"""
IMDB training set creator.
It returns a reader creator, each sample in the reader is an zero-based ID
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Training reader creator
:rtype: callable
"""
return reader_creator(
re.compile("train/pos/.*\.txt$"),
re.compile("train/neg/.*\.txt$"), word_idx)
def test(word_idx):
"""
IMDB test set creator.
It returns a reader creator, each sample in the reader is an zero-based ID
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Test reader creator
:rtype: callable
"""
return reader_creator(
re.compile("test/pos/.*\.txt$"),
re.compile("test/neg/.*\.txt$"), word_idx)
if __name__ == "__main__":
paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
runtime_main(TestDistTextClassification2x2)
...@@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size, ...@@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size,
if weight_sharing: if weight_sharing:
predict = layers.matmul( predict = layers.matmul(
x=dec_output, x=dec_output,
y=fluid.get_var(word_emb_param_names[0]), y=fluid.framework._get_var(word_emb_param_names[0]),
transpose_y=True) transpose_y=True)
else: else:
predict = layers.fc(input=dec_output, predict = layers.fc(input=dec_output,
...@@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase): ...@@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase):
exe.run(startup_prog) exe.run(startup_prog)
exe.run(pserver_prog) exe.run(pserver_prog)
def run_trainer(self, use_cuda, args): def run_trainer(self, args):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() TrainTaskConfig.use_gpu = args.use_cuda
TrainTaskConfig.use_gpu = use_cuda sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
args.is_dist, not args.sync_mode) args.is_dist, not args.sync_mode)
if args.is_dist: if args.is_dist:
...@@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase): ...@@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase):
TrainTaskConfig.batch_size = 20 TrainTaskConfig.batch_size = 20
trainer_prog = fluid.default_main_program() trainer_prog = fluid.default_main_program()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
startup_exe = fluid.Executor(place) startup_exe = fluid.Executor(place)
TrainTaskConfig.local = not args.is_dist TrainTaskConfig.local = not args.is_dist
......
...@@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): ...@@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
if __name__ == "__main__": if __name__ == "__main__":
import os
os.environ['CPU_NUM'] = '1'
os.environ['USE_CUDA'] = "FALSE"
runtime_main(TestDistWord2vec2x2) runtime_main(TestDistWord2vec2x2)
...@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase): ...@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t)) str(actual_t) + " in class " + self.__class__.__name__)
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
......
...@@ -36,7 +36,11 @@ class TestAucOp(OpTest): ...@@ -36,7 +36,11 @@ class TestAucOp(OpTest):
"StatPos": stat_pos, "StatPos": stat_pos,
"StatNeg": stat_neg "StatNeg": stat_neg
} }
self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} self.attrs = {
'curve': 'ROC',
'num_thresholds': num_thresholds,
"slide_steps": 1
}
python_auc = metrics.Auc(name="auc", python_auc = metrics.Auc(name="auc",
curve='ROC', curve='ROC',
...@@ -45,7 +49,6 @@ class TestAucOp(OpTest): ...@@ -45,7 +49,6 @@ class TestAucOp(OpTest):
self.outputs = { self.outputs = {
'AUC': np.array(python_auc.eval()), 'AUC': np.array(python_auc.eval()),
'BatchAUC': np.array(python_auc.eval()),
'StatPosOut': np.array(python_auc._stat_pos), 'StatPosOut': np.array(python_auc._stat_pos),
'StatNegOut': np.array(python_auc._stat_neg) 'StatNegOut': np.array(python_auc._stat_neg)
} }
......
...@@ -20,6 +20,7 @@ import six ...@@ -20,6 +20,7 @@ import six
import sys import sys
import collections import collections
import math import math
import paddle.fluid as fluid
from op_test import OpTest from op_test import OpTest
...@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest): ...@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest):
self.detect = np.array(self.detect).astype('float32') self.detect = np.array(self.detect).astype('float32')
self.mAP = np.array(self.mAP).astype('float32') self.mAP = np.array(self.mAP).astype('float32')
if (len(self.class_pos_count) > 0): if len(self.class_pos_count) > 0:
self.class_pos_count = np.array(self.class_pos_count).astype( self.class_pos_count = np.array(self.class_pos_count).astype(
'int32') 'int32')
self.true_pos = np.array(self.true_pos).astype('float32') self.true_pos = np.array(self.true_pos).astype('float32')
...@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): ...@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
def init_test_case(self): def init_test_case(self):
super(TestDetectionMAPOpMultiBatch, self).init_test_case() super(TestDetectionMAPOpMultiBatch, self).init_test_case()
self.class_pos_count = [0, 2, 1] self.class_pos_count = [0, 2, 1, 0]
self.true_pos_lod = [[0, 3, 2]] self.true_pos_lod = [[0, 3, 2]]
self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
self.false_pos_lod = [[0, 3, 2]] self.false_pos_lod = [[0, 3, 2]]
......
...@@ -18,23 +18,27 @@ import time ...@@ -18,23 +18,27 @@ import time
import unittest import unittest
import os import os
import sys import sys
import six
import signal import signal
import subprocess import subprocess
import six
import argparse import argparse
import paddle.fluid as fluid
RUN_STEP = 10
class TestDistRunnerBase(object): class TestDistRunnerBase(object):
def get_model(self, batch_size=2): def get_model(self, batch_size=2):
raise NotImplementedError( raise NotImplementedError(
"get_model should be implemented by child classes.") "get_model should be implemented by child classes.")
def get_transpiler(self, trainer_id, main_program, pserver_endpoints, @staticmethod
trainers, sync_mode): def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers,
sync_mode):
# NOTE: import fluid until runtime, or else forking processes will cause error. # NOTE: import fluid until runtime, or else forking processes will cause error.
import paddle config = fluid.DistributeTranspilerConfig()
import paddle.fluid as fluid t = fluid.DistributeTranspiler(config=config)
t = fluid.DistributeTranspiler()
t.transpile( t.transpile(
trainer_id=trainer_id, trainer_id=trainer_id,
program=main_program, program=main_program,
...@@ -44,9 +48,9 @@ class TestDistRunnerBase(object): ...@@ -44,9 +48,9 @@ class TestDistRunnerBase(object):
return t return t
def run_pserver(self, args): def run_pserver(self, args):
import paddle
import paddle.fluid as fluid
self.get_model(batch_size=2) self.get_model(batch_size=2)
if args.mem_opt: if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
t = self.get_transpiler(args.trainer_id, t = self.get_transpiler(args.trainer_id,
...@@ -61,12 +65,10 @@ class TestDistRunnerBase(object): ...@@ -61,12 +65,10 @@ class TestDistRunnerBase(object):
exe.run(startup_prog) exe.run(startup_prog)
exe.run(pserver_prog) exe.run(pserver_prog)
def run_trainer(self, use_cuda, args): def run_trainer(self, args):
import paddle
import paddle.fluid as fluid
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
self.get_model(batch_size=2) self.get_model(batch_size=2)
if args.mem_opt: if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
if args.is_dist: if args.is_dist:
...@@ -74,16 +76,23 @@ class TestDistRunnerBase(object): ...@@ -74,16 +76,23 @@ class TestDistRunnerBase(object):
fluid.default_main_program(), fluid.default_main_program(),
args.endpoints, args.trainers, args.endpoints, args.trainers,
args.sync_mode) args.sync_mode)
trainer_prog = t.get_trainer_program() trainer_prog = t.get_trainer_program()
else: else:
trainer_prog = fluid.default_main_program() trainer_prog = fluid.default_main_program()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
startup_exe = fluid.Executor(place) startup_exe = fluid.Executor(place)
startup_exe.run(fluid.default_startup_program()) startup_exe.run(fluid.default_startup_program())
strategy = fluid.ExecutionStrategy() strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1 strategy.num_threads = 1
strategy.allow_op_delay = False strategy.allow_op_delay = False
build_stra = fluid.BuildStrategy() build_stra = fluid.BuildStrategy()
if args.use_reduce: if args.use_reduce:
...@@ -92,7 +101,7 @@ class TestDistRunnerBase(object): ...@@ -92,7 +101,7 @@ class TestDistRunnerBase(object):
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
exe = fluid.ParallelExecutor( exe = fluid.ParallelExecutor(
use_cuda, args.use_cuda,
loss_name=avg_cost.name, loss_name=avg_cost.name,
exec_strategy=strategy, exec_strategy=strategy,
build_strategy=build_stra) build_strategy=build_stra)
...@@ -103,27 +112,26 @@ class TestDistRunnerBase(object): ...@@ -103,27 +112,26 @@ class TestDistRunnerBase(object):
] ]
feeder = fluid.DataFeeder(feed_var_list, place) feeder = fluid.DataFeeder(feed_var_list, place)
reader_generator = test_reader() reader_generator = train_reader()
data = next(reader_generator)
first_loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(data))
print(first_loss)
for i in six.moves.xrange(5): def get_data():
data = next(reader_generator) origin_batch = next(reader_generator)
loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) if args.is_dist and args.use_reader_alloc:
new_batch = []
for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id:
new_batch.append(item)
return new_batch
else:
return origin_batch
data = next(reader_generator) for _ in six.moves.xrange(RUN_STEP):
last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) loss, = exe.run(fetch_list=[avg_cost.name],
print(last_loss) feed=feeder.feed(get_data()))
print(loss)
def runtime_main(test_class): def runtime_main(test_class):
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
parser = argparse.ArgumentParser(description='Run dist test.') parser = argparse.ArgumentParser(description='Run dist test.')
parser.add_argument( parser.add_argument(
'--role', type=str, required=True, choices=['pserver', 'trainer']) '--role', type=str, required=True, choices=['pserver', 'trainer'])
...@@ -135,7 +143,10 @@ def runtime_main(test_class): ...@@ -135,7 +143,10 @@ def runtime_main(test_class):
'--current_endpoint', type=str, required=False, default="") '--current_endpoint', type=str, required=False, default="")
parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--sync_mode', action='store_true')
parser.add_argument('--mem_opt', action='store_true') parser.add_argument('--mem_opt', action='store_true')
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--use_reduce', action='store_true')
parser.add_argument(
'--use_reader_alloc', action='store_true', required=False, default=True)
args = parser.parse_args() args = parser.parse_args()
...@@ -143,8 +154,7 @@ def runtime_main(test_class): ...@@ -143,8 +154,7 @@ def runtime_main(test_class):
if args.role == "pserver" and args.is_dist: if args.role == "pserver" and args.is_dist:
model.run_pserver(args) model.run_pserver(args)
else: else:
use_cuda = True if core.is_compiled_with_cuda() else False model.run_trainer(args)
model.run_trainer(use_cuda, args)
import paddle.compat as cpt import paddle.compat as cpt
...@@ -163,8 +173,10 @@ class TestDistBase(unittest.TestCase): ...@@ -163,8 +173,10 @@ class TestDistBase(unittest.TestCase):
self._find_free_port(), self._find_free_port()) self._find_free_port(), self._find_free_port())
self._python_interp = "python" self._python_interp = "python"
self._sync_mode = True self._sync_mode = True
self._use_cuda = True
self._mem_opt = False self._mem_opt = False
self._use_reduce = False self._use_reduce = False
self._use_reader_alloc = True
self._setup_config() self._setup_config()
def _find_free_port(self): def _find_free_port(self):
...@@ -172,15 +184,15 @@ class TestDistBase(unittest.TestCase): ...@@ -172,15 +184,15 @@ class TestDistBase(unittest.TestCase):
s.bind(('', 0)) s.bind(('', 0))
return s.getsockname()[1] return s.getsockname()[1]
def start_pserver(self, model_file, check_error_log): def start_pserver(self, model_file, check_error_log, required_envs):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
ps0_cmd = ps_cmd % \ ps0_cmd = ps_cmd % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep, (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
self._trainers) self._trainers)
ps1_cmd = ps_cmd % \ ps1_cmd = ps_cmd % \
(self._python_interp, model_file, self._ps_endpoints, ps1_ep, (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers) self._trainers)
if self._sync_mode: if self._sync_mode:
ps0_cmd += " --sync_mode" ps0_cmd += " --sync_mode"
...@@ -198,9 +210,15 @@ class TestDistBase(unittest.TestCase): ...@@ -198,9 +210,15 @@ class TestDistBase(unittest.TestCase):
ps1_pipe = open("/tmp/ps1_err.log", "wb") ps1_pipe = open("/tmp/ps1_err.log", "wb")
ps0_proc = subprocess.Popen( ps0_proc = subprocess.Popen(
ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) ps0_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=ps0_pipe,
env=required_envs)
ps1_proc = subprocess.Popen( ps1_proc = subprocess.Popen(
ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) ps1_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=ps1_pipe,
env=required_envs)
if not check_error_log: if not check_error_log:
return ps0_proc, ps1_proc, None, None return ps0_proc, ps1_proc, None, None
...@@ -222,59 +240,60 @@ class TestDistBase(unittest.TestCase): ...@@ -222,59 +240,60 @@ class TestDistBase(unittest.TestCase):
(e, retry_times)) (e, retry_times))
retry_times -= 1 retry_times -= 1
def check_with_place(self, model_file, delta=1e-3, check_error_log=False): def _run_local(self, model, envs, check_error_log):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1",
"CPU_NUM": "1"
}
if check_error_log: cmd = "%s %s --role trainer" % (self._python_interp, model)
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1" if self._use_cuda:
cmd += " --use_cuda"
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
else:
env_local = {'CPU_NUM': '1'}
envs.update(env_local)
# Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs)
local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
if not check_error_log: if not check_error_log:
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen( local_proc = subprocess.Popen(
local_cmd.split(" "), cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=err_log,
env=env_local) env=envs)
else: else:
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen( local_proc = subprocess.Popen(
local_cmd.split(" "), cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=err_log, stderr=subprocess.PIPE,
env=env_local) env=envs)
local_proc.wait() local_proc.wait()
out, err = local_proc.communicate() local_out, local_err = local_proc.communicate()
local_ret = cpt.to_text(out) local_ret = cpt.to_text(local_out)
sys.stderr.write('local_loss: %s\n' % local_ret)
sys.stderr.write('local_stderr: %s\n' % err) if check_error_log:
err_log.close()
sys.stderr.write('local_stdout: %s\n' % local_ret)
sys.stderr.write('local_stderr: %s\n' % local_err)
local_losses = local_ret.split("\n")
return local_losses
def _run_cluster(self, model, envs, check_error_log):
# Run dist train to compare with local results # Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file, ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
check_error_log) check_error_log, envs)
self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid) self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
tr0_cmd = tr_cmd % \ tr0_cmd = tr_cmd % \
(self._python_interp, model_file, self._ps_endpoints, (self._python_interp, model, self._ps_endpoints,
0, ps0_ep, self._trainers) 0, ps0_ep, self._trainers)
tr1_cmd = tr_cmd % \ tr1_cmd = tr_cmd % \
(self._python_interp, model_file, self._ps_endpoints, (self._python_interp, model, self._ps_endpoints,
1, ps1_ep, self._trainers) 1, ps1_ep, self._trainers)
if self._sync_mode: if self._sync_mode:
tr0_cmd += " --sync_mode" tr0_cmd += " --sync_mode"
...@@ -285,18 +304,28 @@ class TestDistBase(unittest.TestCase): ...@@ -285,18 +304,28 @@ class TestDistBase(unittest.TestCase):
if self._use_reduce: if self._use_reduce:
tr0_cmd += " --use_reduce" tr0_cmd += " --use_reduce"
tr1_cmd += " --use_reduce" tr1_cmd += " --use_reduce"
if self._use_reader_alloc:
tr0_cmd += " --use_reader_alloc"
tr1_cmd += " --use_reader_alloc"
if self._use_cuda:
tr0_cmd += " --use_cuda"
tr1_cmd += " --use_cuda"
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
else:
env0 = {'CPU_NUM': '1'}
env1 = {'CPU_NUM': '1'}
env0.update(envs)
env1.update(envs)
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
env0.update(required_envs)
env1.update(required_envs)
FNULL = open(os.devnull, 'w') FNULL = open(os.devnull, 'w')
tr0_pipe = subprocess.PIPE tr0_pipe = subprocess.PIPE
tr1_pipe = subprocess.PIPE tr1_pipe = subprocess.PIPE
if check_error_log: if check_error_log:
print("tr0_cmd:", tr0_cmd) print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
print("tr1_cmd:", tr1_cmd) print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb") tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb")
...@@ -313,17 +342,11 @@ class TestDistBase(unittest.TestCase): ...@@ -313,17 +342,11 @@ class TestDistBase(unittest.TestCase):
tr0_proc.wait() tr0_proc.wait()
tr1_proc.wait() tr1_proc.wait()
out, err = tr0_proc.communicate()
sys.stderr.write('dist_stderr: %s\n' % err) tr0_out, tr0_err = tr0_proc.communicate()
loss_data0 = cpt.to_text(out) tr0_loss_text = cpt.to_text(tr0_out)
sys.stderr.write('dist_loss: %s\n' % loss_data0) tr1_out, tr1_err = tr1_proc.communicate()
lines = loss_data0.split("\n") tr1_loss_text = cpt.to_text(tr1_out)
dist_first_loss = eval(lines[0].replace(" ", ","))[0]
dist_last_loss = eval(lines[1].replace(" ", ","))[0]
local_lines = local_ret.split("\n")
local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0]
# close trainer file # close trainer file
if check_error_log: if check_error_log:
...@@ -341,5 +364,47 @@ class TestDistBase(unittest.TestCase): ...@@ -341,5 +364,47 @@ class TestDistBase(unittest.TestCase):
ps1.wait() ps1.wait()
FNULL.close() FNULL.close()
self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) # print log
self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
tr0_losses = tr0_loss_text.split("\n")
tr1_losses = tr1_loss_text.split("\n")
return tr0_losses, tr1_losses
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1",
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
local_losses\
= self._run_local(model_file, required_envs,
check_error_log)
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs,
check_error_log)
for step_id in range(RUN_STEP):
local_loss = eval(local_losses[step_id])[0]
tr0_loss = eval(tr0_losses[step_id])[0]
tr1_loss = eval(tr1_losses[step_id])[0]
dist_loss = (tr0_loss + tr1_loss) / 2
print(str(local_loss) + ":" + str(dist_loss))
self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistCTR2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_dist_ctr(self):
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
if __name__ == "__main__":
unittest.main()
...@@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase): ...@@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase):
self._use_reduce = False self._use_reduce = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnist2x2WithMemopt(TestDistBase): class TestDistMnist2x2WithMemopt(TestDistBase):
...@@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase): ...@@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
self._mem_opt = True self._mem_opt = True
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnistAsync(TestDistBase): class TestDistMnistAsync(TestDistBase):
......
...@@ -20,9 +20,10 @@ from test_dist_base import TestDistBase ...@@ -20,9 +20,10 @@ from test_dist_base import TestDistBase
class TestDistSeResneXt2x2(TestDistBase): class TestDistSeResneXt2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._use_reader_alloc = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7) self.check_with_place("dist_se_resnext.py", delta=100)
# TODO(typhoonzero): fix this test # TODO(typhoonzero): fix this test
...@@ -38,6 +39,7 @@ class TestDistSeResneXt2x2(TestDistBase): ...@@ -38,6 +39,7 @@ class TestDistSeResneXt2x2(TestDistBase):
class TestDistSeResneXt2x2Async(TestDistBase): class TestDistSeResneXt2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
self._use_reader_alloc = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=100)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistSimnetBowDense2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBow2x2DenseAsync(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBowSparse2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBow2x2SparseAsync(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistTextClassification2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_text_classification(self):
self.check_with_place("dist_text_classification.py", delta=1e-6)
class TestDistTextClassification2x2Async(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_se_resnext(self):
self.check_with_place("dist_text_classification.py", delta=100)
if __name__ == "__main__":
unittest.main()
...@@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest): ...@@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest):
]) ])
class TestDecayedAdagrad(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1)
opt.minimize(avg_cost)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer, _ = self.get_trainer()
class TestLRDecayConditional(TranspilerTest): class TestLRDecayConditional(TranspilerTest):
def net_conf(self): def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32') x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
......
...@@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase): ...@@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase):
self._sync_mode = False self._sync_mode = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1) self.check_with_place("dist_word2vec.py", delta=100)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest): ...@@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest):
'eta': self.eta 'eta': self.eta
} }
print("lod = ", self.lod)
self.outputs = { self.outputs = {
'RpnRois': (self.rpn_rois[0], [self.lod]), 'RpnRois': (self.rpn_rois[0], [self.lod]),
'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]) 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod])
...@@ -295,7 +294,7 @@ class TestGenerateProposalsOp(OpTest): ...@@ -295,7 +294,7 @@ class TestGenerateProposalsOp(OpTest):
self.post_nms_topN = 5000 # train 6000, test 1000 self.post_nms_topN = 5000 # train 6000, test 1000
self.nms_thresh = 0.7 self.nms_thresh = 0.7
self.min_size = 3.0 self.min_size = 3.0
self.eta = 0.8 self.eta = 1.
def init_test_input(self): def init_test_input(self):
batch_size = 1 batch_size = 1
......
...@@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase): ...@@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase):
mul_op_desc.set_input("X", ["x"]) mul_op_desc.set_input("X", ["x"])
mul_op_desc.set_input("Y", ["y"]) mul_op_desc.set_input("Y", ["y"])
mul_op_desc.set_output("Out", ["out"]) mul_op_desc.set_output("Out", ["out"])
mul_op_desc.set_attr("x_num_col_dims", 1) mul_op_desc._set_attr("x_num_col_dims", 1)
mul_op_desc.set_attr("y_num_col_dims", 1) mul_op_desc._set_attr("y_num_col_dims", 1)
mul_op_desc.check_attrs() mul_op_desc.check_attrs()
mul_op_desc.infer_shape(block) mul_op_desc.infer_shape(block)
......
...@@ -541,7 +541,7 @@ class TestBook(unittest.TestCase): ...@@ -541,7 +541,7 @@ class TestBook(unittest.TestCase):
with program_guard(program): with program_guard(program):
input = layers.data( input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32") name="input", shape=[3, 100, 100], dtype="float32")
out = layers.shape(input, name="shape") out = layers.shape(input)
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
...@@ -758,6 +758,65 @@ class TestBook(unittest.TestCase): ...@@ -758,6 +758,65 @@ class TestBook(unittest.TestCase):
out = layers.expand(x, [1, 2]) out = layers.expand(x, [1, 2])
print(str(program)) print(str(program))
def test_uniform_random_batch_size_like(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.uniform_random_batch_size_like(input, [-1, 11])
self.assertIsNotNone(out)
print(str(program))
def test_gaussian_random(self):
program = Program()
with program_guard(program):
out = layers.gaussian_random(shape=[20, 30])
self.assertIsNotNone(out)
print(str(program))
def test_sampling_id(self):
program = Program()
with program_guard(program):
x = layers.data(
name="X",
shape=[13, 11],
dtype='float32',
append_batch_size=False)
out = layers.sampling_id(x)
self.assertIsNotNone(out)
print(str(program))
def test_gaussian_random_batch_size_like(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.gaussian_random_batch_size_like(
input, shape=[-1, 11], mean=1.0, std=2.0)
self.assertIsNotNone(out)
print(str(program))
def test_sum(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.sum(input)
self.assertIsNotNone(out)
print(str(program))
def test_slice(self):
starts = [1, 0, 2]
ends = [3, 3, 4]
axes = [0, 1, 2]
program = Program()
with program_guard(program):
input = layers.data(
name="input", shape=[3, 4, 5, 6], dtype='float32')
out = layers.slice(input, axes=axes, starts=starts, ends=ends)
def test_softshrink(self): def test_softshrink(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase): ...@@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase):
self.assertEqual(['z'], op.output("Out")) self.assertEqual(['z'], op.output("Out"))
self.assertEqual(["Out"], op.output_names()) self.assertEqual(["Out"], op.output_names())
op.set_attr("int_attr", 1) op._set_attr("int_attr", 1)
self.assertEqual(1, op.attr("int_attr")) self.assertEqual(1, op.attr("int_attr"))
self.assertTrue(op.has_attr("int_attr")) self.assertTrue(op.has_attr("int_attr"))
self.assertEqual(core.AttrType.INT, op.attr_type("int_attr")) self.assertEqual(core.AttrType.INT, op.attr_type("int_attr"))
op.set_attr("float_attr", -1.32) op._set_attr("float_attr", -1.32)
self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4) self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4)
self.assertTrue(op.has_attr("float_attr")) self.assertTrue(op.has_attr("float_attr"))
op.set_attr("bool_attr", False) op._set_attr("bool_attr", False)
self.assertFalse(op.attr("bool_attr")) self.assertFalse(op.attr("bool_attr"))
op.set_attr("string_attr", "abc") op._set_attr("string_attr", "abc")
self.assertEqual("abc", op.attr("string_attr")) self.assertEqual("abc", op.attr("string_attr"))
self.assertTrue(op.has_attr("string_attr")) self.assertTrue(op.has_attr("string_attr"))
op.set_attr("ints_attr", [1, 2, 3]) op._set_attr("ints_attr", [1, 2, 3])
self.assertEqual([1, 2, 3], op.attr("ints_attr")) self.assertEqual([1, 2, 3], op.attr("ints_attr"))
expected = [1.2, 2.3, 3.4] expected = [1.2, 2.3, 3.4]
op.set_attr("floats_attr", expected) op._set_attr("floats_attr", expected)
for e, a in zip(expected, op.attr("floats_attr")): for e, a in zip(expected, op.attr("floats_attr")):
self.assertAlmostEqual(e, a, delta=1e-4) self.assertAlmostEqual(e, a, delta=1e-4)
op.set_attr("strings_attr", ["a", "b", "c"]) op._set_attr("strings_attr", ["a", "b", "c"])
self.assertEqual(["a", "b", "c"], op.attr("strings_attr")) self.assertEqual(["a", "b", "c"], op.attr("strings_attr"))
op.set_attr("bools_attr", [True, False, True]) op._set_attr("bools_attr", [True, False, True])
self.assertEqual([True, False, True], op.attr("bools_attr")) self.assertEqual([True, False, True], op.attr("bools_attr"))
self.assertEqual(8, len(op.attr_names())) self.assertEqual(8, len(op.attr_names()))
op.set_block_attr("block_attr", program_desc.block(0)) op.set_block_attr("_block_attr", program_desc.block(0))
self.assertEqual(0, op.block_attr_id("block_attr")) self.assertEqual(0, op._block_attr_id("_block_attr"))
mul_op = block.append_op() mul_op = block.append_op()
mul_op.set_type("mul") mul_op.set_type("mul")
......
...@@ -128,7 +128,7 @@ def op_to_code(op): ...@@ -128,7 +128,7 @@ def op_to_code(op):
attr_type = op.desc.attr_type(name) attr_type = op.desc.attr_type(name)
if attr_type == core.AttrType.BLOCK: if attr_type == core.AttrType.BLOCK:
a = "{name} = block[{value}]".format( a = "{name} = block[{value}]".format(
name=name, type=attr_type, value=op.block_attr_id(name)) name=name, type=attr_type, value=op._block_attr_id(name))
attrs_str += a attrs_str += a
if i != len(attr_names) - 1: if i != len(attr_names) - 1:
attrs_str += ", " attrs_str += ", "
...@@ -136,7 +136,7 @@ def op_to_code(op): ...@@ -136,7 +136,7 @@ def op_to_code(op):
if attr_type == core.AttrType.BLOCKS: if attr_type == core.AttrType.BLOCKS:
a = "{name} = blocks{value}".format( a = "{name} = blocks{value}".format(
name=name, type=attr_type, value=op.blocks_attr_ids(name)) name=name, type=attr_type, value=op._blocks_attr_ids(name))
attrs_str += a attrs_str += a
if i != len(attr_names) - 1: if i != len(attr_names) - 1:
attrs_str += ", " attrs_str += ", "
......
...@@ -39,8 +39,8 @@ import six ...@@ -39,8 +39,8 @@ import six
from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework from .. import core, framework
from ..framework import Program, default_main_program, \ from ..framework import Program, default_main_program, \
default_startup_program, Block, \ default_startup_program, Block, \
Parameter, grad_var_name Parameter, grad_var_name
from .details import * from .details import *
from functools import reduce from functools import reduce
...@@ -178,7 +178,7 @@ class DistributeTranspiler(object): ...@@ -178,7 +178,7 @@ class DistributeTranspiler(object):
pserver_program) pserver_program)
elif role == "TRAINER": elif role == "TRAINER":
trainer_program = t.get_trainer_program() trainer_program = t.get_trainer_program()
# for nccl2 mode # for nccl2 mode
config = fluid.DistributeTranspilerConfig() config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2" config.mode = "nccl2"
...@@ -470,7 +470,10 @@ class DistributeTranspiler(object): ...@@ -470,7 +470,10 @@ class DistributeTranspiler(object):
""" """
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
# FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
lr_ops = self._get_lr_ops()
delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), self.optimize_ops)
delete_ops(self.origin_program.global_block(), lr_ops)
self.origin_program.__str__() self.origin_program.__str__()
if wait_port: if wait_port:
...@@ -534,7 +537,7 @@ class DistributeTranspiler(object): ...@@ -534,7 +537,7 @@ class DistributeTranspiler(object):
}) })
for varname, splited_var in six.iteritems(self.param_var_mapping): for varname, splited_var in six.iteritems(self.param_var_mapping):
#add concat ops to merge splited parameters received from parameter servers. # add concat ops to merge splited parameters received from parameter servers.
if len(splited_var) <= 1: if len(splited_var) <= 1:
continue continue
# NOTE: if enable memory optimization, origin vars maybe removed. # NOTE: if enable memory optimization, origin vars maybe removed.
...@@ -668,7 +671,7 @@ in a single call.") ...@@ -668,7 +671,7 @@ in a single call.")
__clone_lr_op_sub_block__(cloned_op, program, new_sub_block) __clone_lr_op_sub_block__(cloned_op, program, new_sub_block)
# reset the block of op # reset the block of op
op.set_attr('sub_block', new_sub_block) op._set_attr('sub_block', new_sub_block)
# append lr decay ops to the child block if exists # append lr decay ops to the child block if exists
lr_ops = self._get_lr_ops() lr_ops = self._get_lr_ops()
...@@ -734,19 +737,14 @@ in a single call.") ...@@ -734,19 +737,14 @@ in a single call.")
table_opt_block = self._create_table_optimize_block( table_opt_block = self._create_table_optimize_block(
pserver_index, pserver_program, pre_block_idx, grad_to_block_id) pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
optimize_blocks.append(table_opt_block) optimize_blocks.append(table_opt_block)
prefetch_var_name_to_block_id = self._create_prefetch_block( lookup_table_var_name_to_block_id = self._create_prefetch_block(
pserver_index, pserver_program, table_opt_block) pserver_index, pserver_program, table_opt_block)
checkpoint_block_id = self._create_checkpoint_save_block( checkpoint_block_id = self._create_checkpoint_save_block(
pserver_program, table_opt_block.idx) pserver_program, table_opt_block.idx)
pserver_program._distributed_lookup_table = self.table_name pserver_program._distributed_lookup_table = self.table_name
prefetch_var_name_to_block_id.extend(
# NOTE: if has_distributed_lookup_table is False, then prefetch_block will lookup_table_var_name_to_block_id)
# not be executed, so it's safe to use optimize_block to hold the place
if self.has_distributed_lookup_table:
assert len(prefetch_var_name_to_block_id) > 0
else:
assert len(prefetch_var_name_to_block_id) == 0
attrs = { attrs = {
"optimize_blocks": optimize_blocks, "optimize_blocks": optimize_blocks,
...@@ -755,11 +753,14 @@ in a single call.") ...@@ -755,11 +753,14 @@ in a single call.")
"sync_mode": self.sync_mode, "sync_mode": self.sync_mode,
"grad_to_block_id": grad_to_block_id, "grad_to_block_id": grad_to_block_id,
} }
if len(prefetch_var_name_to_block_id) > 0:
attrs['prefetch_var_name_to_block_id'] \ if self.has_distributed_lookup_table:
= prefetch_var_name_to_block_id
attrs['checkpint_block_id'] = checkpoint_block_id attrs['checkpint_block_id'] = checkpoint_block_id
if len(prefetch_var_name_to_block_id) > 0:
attrs[
'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id
# step5 append the listen_and_serv op # step5 append the listen_and_serv op
pserver_program.global_block().append_op( pserver_program.global_block().append_op(
type="listen_and_serv", type="listen_and_serv",
...@@ -864,7 +865,7 @@ to transpile() call.") ...@@ -864,7 +865,7 @@ to transpile() call.")
if op.type in [ if op.type in [
"gaussian_random", "fill_constant", "uniform_random" "gaussian_random", "fill_constant", "uniform_random"
]: ]:
op.set_attr("shape", list(new_outputs["Out"].shape)) op._set_attr("shape", list(new_outputs["Out"].shape))
s_prog.global_block().append_op( s_prog.global_block().append_op(
type=op.type, type=op.type,
inputs=new_inputs, inputs=new_inputs,
...@@ -1013,7 +1014,7 @@ to transpile() call.") ...@@ -1013,7 +1014,7 @@ to transpile() call.")
for g, p in zip(grad_blocks, param_blocks): for g, p in zip(grad_blocks, param_blocks):
g_name, g_bid, _ = g.split(":") g_name, g_bid, _ = g.split(":")
p_name, p_bid, _ = p.split(":") p_name, p_bid, _ = p.split(":")
self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \
self.param_var_mapping[p_name][int(p_bid)] self.param_var_mapping[p_name][int(p_bid)]
# create mapping of endpoint -> split var to create pserver side program # create mapping of endpoint -> split var to create pserver side program
...@@ -1320,7 +1321,7 @@ to transpile() call.") ...@@ -1320,7 +1321,7 @@ to transpile() call.")
if len(splited) == 1: if len(splited) == 1:
if self.sync_mode and add_trainer_suffix: if self.sync_mode and add_trainer_suffix:
new_var_name = "%s.trainer_%d" % \ new_var_name = "%s.trainer_%d" % \
(orig_var.name, self.trainer_id) (orig_var.name, self.trainer_id)
program.global_block()._rename_var(varname, new_var_name) program.global_block()._rename_var(varname, new_var_name)
var_mapping[varname] = \ var_mapping[varname] = \
[program.global_block().var(new_var_name)] [program.global_block().var(new_var_name)]
...@@ -1343,10 +1344,10 @@ to transpile() call.") ...@@ -1343,10 +1344,10 @@ to transpile() call.")
new_var_name = "" new_var_name = ""
if self.sync_mode and add_trainer_suffix: if self.sync_mode and add_trainer_suffix:
new_var_name = "%s.block%d.trainer_%d" % \ new_var_name = "%s.block%d.trainer_%d" % \
(varname, i, self.trainer_id) (varname, i, self.trainer_id)
else: else:
new_var_name = "%s.block%d" % \ new_var_name = "%s.block%d" % \
(varname, i) (varname, i)
var = program.global_block().create_var( var = program.global_block().create_var(
name=new_var_name, name=new_var_name,
persistable=False, persistable=False,
...@@ -1430,6 +1431,9 @@ to transpile() call.") ...@@ -1430,6 +1431,9 @@ to transpile() call.")
elif op_type == "rmsprop": elif op_type == "rmsprop":
if varkey in ["Moment", "MeanSquare"]: if varkey in ["Moment", "MeanSquare"]:
return param_shape return param_shape
elif op_type == "decayed_adagrad":
if varkey == "Moment":
return param_shape
elif op_type == "sgd": elif op_type == "sgd":
pass pass
return orig_shape return orig_shape
...@@ -1484,9 +1488,8 @@ to transpile() call.") ...@@ -1484,9 +1488,8 @@ to transpile() call.")
vars2merge = [] vars2merge = []
for i in range(self.trainer_num): for i in range(self.trainer_num):
per_trainer_name = "%s.trainer_%d" % \ per_trainer_name = "%s.trainer_%d" % \
(merged_var_name, i) (merged_var_name, i)
vars2merge.append(pserver_block.vars[per_trainer_name]) vars2merge.append(pserver_block.vars[per_trainer_name])
optimize_block.append_op( optimize_block.append_op(
type="sum", type="sum",
inputs={"X": vars2merge}, inputs={"X": vars2merge},
...@@ -1645,7 +1648,7 @@ to transpile() call.") ...@@ -1645,7 +1648,7 @@ to transpile() call.")
# one op's output is another op's input, we say # one op's output is another op's input, we say
# the two operator is connected. # the two operator is connected.
if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \ if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
return True return True
return False return False
...@@ -1662,7 +1665,7 @@ to transpile() call.") ...@@ -1662,7 +1665,7 @@ to transpile() call.")
def _is_optimizer_op(self, op): def _is_optimizer_op(self, op):
if "Param" in op.input_names and \ if "Param" in op.input_names and \
"LearningRate" in op.input_names: "LearningRate" in op.input_names:
return True return True
return False return False
...@@ -1737,7 +1740,7 @@ to transpile() call.") ...@@ -1737,7 +1740,7 @@ to transpile() call.")
# NOTE: we need to skip all optimize ops, since it is connected # NOTE: we need to skip all optimize ops, since it is connected
# with forward/backward ops and lr ops, we only need the lr ops. # with forward/backward ops and lr ops, we only need the lr ops.
if op1 != op2 and self._is_op_connected(op1, op2) and \ if op1 != op2 and self._is_op_connected(op1, op2) and \
not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2): not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
ufind.union(op1, op2) ufind.union(op1, op2)
# find all ops which is related with lr var # find all ops which is related with lr var
for op1 in block.ops: for op1 in block.ops:
......
...@@ -163,7 +163,7 @@ class InferenceTranspiler(object): ...@@ -163,7 +163,7 @@ class InferenceTranspiler(object):
next_op = self.block.ops[i + 1] next_op = self.block.ops[i + 1]
if next_op.type == 'relu': if next_op.type == 'relu':
# modify bnorm OP to include relu # modify bnorm OP to include relu
current_op.set_attr("fuse_with_relu", True) current_op._set_attr("fuse_with_relu", True)
# remove relu OP # remove relu OP
self.block._remove_op(i + 1) self.block._remove_op(i + 1)
i = i + 1 i = i + 1
...@@ -377,7 +377,7 @@ class InferenceTranspiler(object): ...@@ -377,7 +377,7 @@ class InferenceTranspiler(object):
type=old_var.type, type=old_var.type,
dtype=old_var.dtype, dtype=old_var.dtype,
shape=old_var.shape) shape=old_var.shape)
op.rename_input(old_param_name, new_param_name) op._rename_input(old_param_name, new_param_name)
self.scope.var(new_param_name) self.scope.var(new_param_name)
tensor = self.scope.find_var(new_param_name).get_tensor() tensor = self.scope.find_var(new_param_name).get_tensor()
...@@ -463,8 +463,8 @@ class InferenceTranspiler(object): ...@@ -463,8 +463,8 @@ class InferenceTranspiler(object):
current_op = self.block.ops[i] current_op = self.block.ops[i]
for input_arg in current_op.input_arg_names: for input_arg in current_op.input_arg_names:
if input_arg in self.input_map: if input_arg in self.input_map:
current_op.rename_input(input_arg, current_op._rename_input(input_arg,
self.input_map[input_arg]) self.input_map[input_arg])
def _remove_unused_var(self): def _remove_unused_var(self):
''' '''
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册